10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
22 #include <Kokkos_ArithTraits.hpp>
23 #include <KokkosBatched_Util.hpp>
24 #include <KokkosBatched_Vector.hpp>
25 #include <KokkosBatched_Copy_Decl.hpp>
26 #include <KokkosBatched_Copy_Impl.hpp>
27 #include <KokkosBatched_AddRadial_Decl.hpp>
28 #include <KokkosBatched_AddRadial_Impl.hpp>
29 #include <KokkosBatched_SetIdentity_Decl.hpp>
30 #include <KokkosBatched_SetIdentity_Impl.hpp>
31 #include <KokkosBatched_Gemm_Decl.hpp>
32 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
33 #include <KokkosBatched_Gemm_Team_Impl.hpp>
34 #include <KokkosBatched_Gemv_Decl.hpp>
35 #include <KokkosBatched_Gemv_Team_Impl.hpp>
36 #include <KokkosBatched_Trsm_Decl.hpp>
37 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
38 #include <KokkosBatched_Trsm_Team_Impl.hpp>
39 #include <KokkosBatched_Trsv_Decl.hpp>
40 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
41 #include <KokkosBatched_Trsv_Team_Impl.hpp>
42 #include <KokkosBatched_LU_Decl.hpp>
43 #include <KokkosBatched_LU_Serial_Impl.hpp>
44 #include <KokkosBatched_LU_Team_Impl.hpp>
46 #include <KokkosBlas1_nrm1.hpp>
47 #include <KokkosBlas1_nrm2.hpp>
51 #include "Ifpack2_BlockHelper.hpp"
52 #include "Ifpack2_BlockComputeResidualVector.hpp"
53 #include "Ifpack2_BlockComputeResidualAndSolve.hpp"
60 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
61 #include "cuda_profiler_api.h"
66 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
74 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
78 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
81 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
82 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
86 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
90 namespace BlockTriDiContainerDetails {
92 namespace KB = KokkosBatched;
99 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
100 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
101 MemoryTraitsType::is_random_access |
104 template <
typename ViewType>
105 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
106 typename ViewType::array_layout,
107 typename ViewType::device_type,
108 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
109 template <
typename ViewType>
110 using Atomic = Kokkos::View<
typename ViewType::data_type,
111 typename ViewType::array_layout,
112 typename ViewType::device_type,
113 MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
114 template <
typename ViewType>
115 using Const = Kokkos::View<
typename ViewType::const_data_type,
116 typename ViewType::array_layout,
117 typename ViewType::device_type,
118 typename ViewType::memory_traits>;
119 template <
typename ViewType>
120 using ConstUnmanaged = Const<Unmanaged<ViewType>>;
122 template <
typename ViewType>
123 using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
125 template <
typename ViewType>
126 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
127 typename ViewType::array_layout,
128 typename ViewType::device_type,
129 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
131 template <
typename ViewType>
132 using Scratch = Kokkos::View<
typename ViewType::data_type,
133 typename ViewType::array_layout,
134 typename ViewType::execution_space::scratch_memory_space,
135 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
140 template <
typename T>
142 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
148 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
149 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
150 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
152 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
153 { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
157 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
163 template <
typename MatrixType>
166 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
168 using tpetra_map_type =
typename impl_type::tpetra_map_type;
169 using tpetra_mv_type =
typename impl_type::tpetra_block_multivector_type;
170 using tpetra_import_type =
typename impl_type::tpetra_import_type;
171 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
172 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
174 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
175 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
177 bool hasBlockCrsMatrix = !A_bcrs.is_null();
180 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
182 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
183 const auto src =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
184 const auto tgt =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
185 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
194 template <
typename MatrixType>
195 struct AsyncableImport {
203 #if !defined(HAVE_IFPACK2_MPI)
204 typedef int MPI_Request;
205 typedef int MPI_Comm;
207 using scalar_type =
typename impl_type::scalar_type;
211 static int isend(
const MPI_Comm comm,
const char *buf,
int count,
int dest,
int tag, MPI_Request *ireq) {
212 #ifdef HAVE_IFPACK2_MPI
214 int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
215 if (ireq == NULL) MPI_Request_free(&ureq);
222 static int irecv(
const MPI_Comm comm,
char *buf,
int count,
int src,
int tag, MPI_Request *ireq) {
223 #ifdef HAVE_IFPACK2_MPI
225 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
226 if (ireq == NULL) MPI_Request_free(&ureq);
233 static int waitany(
int count, MPI_Request *reqs,
int *index) {
234 #ifdef HAVE_IFPACK2_MPI
235 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
241 static int waitall(
int count, MPI_Request *reqs) {
242 #ifdef HAVE_IFPACK2_MPI
243 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
250 using tpetra_map_type =
typename impl_type::tpetra_map_type;
251 using tpetra_import_type =
typename impl_type::tpetra_import_type;
253 using local_ordinal_type =
typename impl_type::local_ordinal_type;
254 using global_ordinal_type =
typename impl_type::global_ordinal_type;
258 using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
259 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
261 using execution_space =
typename impl_type::execution_space;
262 using memory_space =
typename impl_type::memory_space;
263 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
265 using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
267 #if defined(KOKKOS_ENABLE_CUDA)
268 using impl_scalar_type_1d_view =
269 typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
270 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
271 Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
272 #elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
273 Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
274 #else // no experimental macros are defined
275 typename impl_type::impl_scalar_type_1d_view,
277 typename impl_type::impl_scalar_type_1d_view>::type;
279 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
281 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
282 using impl_scalar_type_2d_view =
typename impl_type::impl_scalar_type_2d_view;
283 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
285 #ifdef HAVE_IFPACK2_MPI
289 impl_scalar_type_2d_view_tpetra remote_multivector;
290 local_ordinal_type blocksize;
292 template <
typename T>
293 struct SendRecvPair {
298 SendRecvPair<int_1d_view_host> pids;
299 SendRecvPair<std::vector<MPI_Request>> reqs;
300 SendRecvPair<size_type_1d_view> offset;
301 SendRecvPair<size_type_1d_view_host> offset_host;
302 SendRecvPair<local_ordinal_type_1d_view> lids;
303 SendRecvPair<impl_scalar_type_1d_view> buffer;
304 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;
306 local_ordinal_type_1d_view dm2cm;
308 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
309 using exec_instance_1d_std_vector = std::vector<execution_space>;
310 exec_instance_1d_std_vector exec_instances;
316 const size_type_1d_view &offs) {
318 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.
getRawPtr()), lens.
size());
319 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
322 const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
323 const local_ordinal_type lens_size = lens_device.extent(0);
324 Kokkos::parallel_scan(
325 "AsyncableImport::RangePolicy::setOffsetValues",
326 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
329 update += (i < lens_size ? lens_device[i] : 0);
334 const size_type_1d_view_host &offs) {
336 Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.
getRawPtr()), lens.
size());
337 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
341 for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
342 offs(i) = offs(i - 1) + lens[i - 1];
347 void createMpiRequests(
const tpetra_import_type &
import) {
348 Tpetra::Distributor &distributor =
import.getDistributor();
351 const auto pids_from = distributor.getProcsFrom();
353 memcpy(pids.recv.data(), pids_from.getRawPtr(),
sizeof(int) * pids.recv.extent(0));
355 const auto pids_to = distributor.getProcsTo();
357 memcpy(pids.send.data(), pids_to.getRawPtr(),
sizeof(int) * pids.send.extent(0));
360 reqs.recv.resize(pids.recv.extent(0));
361 memset(reqs.recv.data(), 0, reqs.recv.size() *
sizeof(MPI_Request));
362 reqs.send.resize(pids.send.extent(0));
363 memset(reqs.send.data(), 0, reqs.send.size() *
sizeof(MPI_Request));
367 const auto lengths_to = distributor.getLengthsTo();
370 const auto lengths_from = distributor.getLengthsFrom();
373 setOffsetValues(lengths_to, offset.send);
374 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
376 setOffsetValues(lengths_from, offset.recv);
377 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
379 const auto lengths_to = distributor.getLengthsTo();
380 offset_host.send = size_type_1d_view_host(
do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
382 const auto lengths_from = distributor.getLengthsFrom();
383 offset_host.recv = size_type_1d_view_host(
do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
385 setOffsetValuesHost(lengths_to, offset_host.send);
388 setOffsetValuesHost(lengths_from, offset_host.recv);
393 void createSendRecvIDs(
const tpetra_import_type &
import) {
395 const auto remote_lids =
import.getRemoteLIDs();
396 const local_ordinal_type_1d_view_host
397 remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
399 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
402 auto epids =
import.getExportPIDs();
403 auto elids =
import.getExportLIDs();
406 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
409 for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
410 const auto pid_send_value = pids.send[i];
411 for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
412 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
413 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
415 Kokkos::deep_copy(lids.send, lids_send_host);
418 void createExecutionSpaceInstances() {
419 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
421 #if KOKKOS_VERSION >= 40699
423 Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
426 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
434 struct ToMultiVector {};
438 const local_ordinal_type blocksize_,
439 const local_ordinal_type_1d_view dm2cm_) {
440 blocksize = blocksize_;
443 #ifdef HAVE_IFPACK2_MPI
444 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
446 const tpetra_import_type
import(src_map, tgt_map);
448 createMpiRequests(
import);
449 createSendRecvIDs(
import);
450 createExecutionSpaceInstances();
453 void createDataBuffer(
const local_ordinal_type &num_vectors) {
454 const size_type extent_0 = lids.recv.extent(0) * blocksize;
455 const size_type extent_1 = num_vectors;
456 if (remote_multivector.extent(0) == extent_0 &&
457 remote_multivector.extent(1) == extent_1) {
463 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
464 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
469 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
470 buffer_host.send = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer send"), send_buffer_size);
471 buffer_host.recv = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
477 #ifdef HAVE_IFPACK2_MPI
478 waitall(reqs.recv.size(), reqs.recv.data());
479 waitall(reqs.send.size(), reqs.send.data());
487 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
488 template <
typename PackTag>
489 static void copy(
const local_ordinal_type_1d_view &lids_,
490 const impl_scalar_type_1d_view &buffer_,
491 const local_ordinal_type ibeg_,
492 const local_ordinal_type iend_,
493 const impl_scalar_type_2d_view_tpetra &multivector_,
494 const local_ordinal_type blocksize_,
495 const execution_space &exec_instance_) {
496 const local_ordinal_type num_vectors = multivector_.extent(1);
497 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
498 const local_ordinal_type idiff = iend_ - ibeg_;
499 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
501 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
502 local_ordinal_type vector_size(0);
505 else if (blocksize_ <= 8)
507 else if (blocksize_ <= 16)
512 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
513 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
514 Kokkos::parallel_for(
515 Kokkos::Experimental::require(policy, work_item_property),
516 KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
517 const local_ordinal_type i = member.league_rank();
518 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
519 auto aptr = abase + blocksize_ * (i + idiff * j);
520 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
521 if (std::is_same<PackTag, ToBuffer>::value)
522 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
526 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
533 void asyncSendRecvVar1(
const impl_scalar_type_2d_view_tpetra &mv) {
534 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
536 #ifdef HAVE_IFPACK2_MPI
538 const local_ordinal_type num_vectors = mv.extent(1);
539 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
542 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
543 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
545 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
546 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
552 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
553 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
561 execution_space().fence();
564 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
566 if (i < 8) exec_instances[i % 8].fence();
567 copy<ToBuffer>(lids.send, buffer.send,
568 offset_host.send(i), offset_host.send(i + 1),
571 exec_instances[i % 8]);
572 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
574 const local_ordinal_type num_vectors = mv.extent(1);
575 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
577 Kokkos::deep_copy(exec_instances[i % 8],
578 Kokkos::subview(buffer_host.send,
579 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
580 offset_host.send(i) * mv_blocksize,
581 offset_host.send(i + 1) * mv_blocksize)),
582 Kokkos::subview(buffer.send,
583 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
584 offset_host.send(i) * mv_blocksize,
585 offset_host.send(i + 1) * mv_blocksize)));
590 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
592 if (i < 8) exec_instances[i % 8].fence();
593 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
595 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
596 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
602 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
603 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
611 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
614 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
616 #endif // HAVE_IFPACK2_MPI
617 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
620 void syncRecvVar1() {
621 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
622 #ifdef HAVE_IFPACK2_MPI
624 for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
625 local_ordinal_type idx = i;
628 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
630 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
631 const local_ordinal_type num_vectors = remote_multivector.extent(1);
632 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
635 Kokkos::subview(buffer.recv,
636 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
637 offset_host.recv(idx) * mv_blocksize,
638 offset_host.recv(idx + 1) * mv_blocksize)),
639 Kokkos::subview(buffer_host.recv,
640 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
641 offset_host.recv(idx) * mv_blocksize,
642 offset_host.recv(idx + 1) * mv_blocksize)));
646 copy<ToMultiVector>(lids.recv, buffer.recv,
647 offset_host.recv(idx), offset_host.recv(idx + 1),
648 remote_multivector, blocksize,
649 exec_instances[idx % 8]);
656 waitall(reqs.send.size(), reqs.send.data());
657 #endif // HAVE_IFPACK2_MPI
658 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
660 #endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
667 template <
typename PackTag>
668 static void copy(
const local_ordinal_type_1d_view &lids_,
669 const impl_scalar_type_1d_view &buffer_,
670 const local_ordinal_type &ibeg_,
671 const local_ordinal_type &iend_,
672 const impl_scalar_type_2d_view_tpetra &multivector_,
673 const local_ordinal_type blocksize_) {
674 const local_ordinal_type num_vectors = multivector_.extent(1);
675 const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
676 const local_ordinal_type idiff = iend_ - ibeg_;
677 const auto abase = buffer_.data() + mv_blocksize * ibeg_;
678 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
679 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
680 local_ordinal_type vector_size(0);
683 else if (blocksize_ <= 8)
685 else if (blocksize_ <= 16)
689 const team_policy_type policy(idiff, 1, vector_size);
690 Kokkos::parallel_for(
691 "AsyncableImport::TeamPolicy::copy",
692 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
693 const local_ordinal_type i = member.league_rank();
694 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](
const local_ordinal_type &j) {
695 auto aptr = abase + blocksize_ * (i + idiff * j);
696 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
697 if (std::is_same<PackTag, ToBuffer>::value)
698 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
702 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](
const local_ordinal_type &k) {
708 const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
709 Kokkos::parallel_for(
710 "AsyncableImport::RangePolicy::copy",
711 policy, KOKKOS_LAMBDA(
const local_ordinal_type &ij) {
712 const local_ordinal_type i = ij % idiff;
713 const local_ordinal_type j = ij / idiff;
714 auto aptr = abase + blocksize_ * (i + idiff * j);
715 auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
716 auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
717 auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
718 memcpy(to, from,
sizeof(impl_scalar_type) * blocksize_);
726 void asyncSendRecvVar0(
const impl_scalar_type_2d_view_tpetra &mv) {
727 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
729 #ifdef HAVE_IFPACK2_MPI
731 const local_ordinal_type num_vectors = mv.extent(1);
732 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
735 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
736 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
738 reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
739 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
745 reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
746 (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize *
sizeof(impl_scalar_type),
754 for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
755 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
758 if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
760 reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
761 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
767 Kokkos::subview(buffer_host.send,
768 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
769 offset_host.send(i) * mv_blocksize,
770 offset_host.send(i + 1) * mv_blocksize)),
771 Kokkos::subview(buffer.send,
772 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
773 offset_host.send(i) * mv_blocksize,
774 offset_host.send(i + 1) * mv_blocksize)));
776 reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
777 (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize *
sizeof(impl_scalar_type),
786 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
789 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
792 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
795 void syncRecvVar0() {
796 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
797 #ifdef HAVE_IFPACK2_MPI
799 for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
800 local_ordinal_type idx = i;
801 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
802 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
803 const local_ordinal_type num_vectors = remote_multivector.extent(1);
804 const local_ordinal_type mv_blocksize = blocksize * num_vectors;
806 Kokkos::subview(buffer.recv,
807 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
808 offset_host.recv(idx) * mv_blocksize,
809 offset_host.recv(idx + 1) * mv_blocksize)),
810 Kokkos::subview(buffer_host.recv,
811 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
812 offset_host.recv(idx) * mv_blocksize,
813 offset_host.recv(idx + 1) * mv_blocksize)));
815 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
816 remote_multivector, blocksize);
819 waitall(reqs.send.size(), reqs.send.data());
821 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
827 void asyncSendRecv(
const impl_scalar_type_2d_view_tpetra &mv) {
828 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
829 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
830 asyncSendRecvVar1(mv);
832 asyncSendRecvVar0(mv);
835 asyncSendRecvVar0(mv);
839 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
840 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
850 void syncExchange(
const impl_scalar_type_2d_view_tpetra &mv) {
851 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
854 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
857 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView()
const {
return remote_multivector; }
860 template <
typename ViewType1,
typename ViewType2>
861 struct are_same_struct {
865 are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
868 KOKKOS_INLINE_FUNCTION
869 void operator()(
int i,
unsigned int &count)
const {
870 if (keys1(i) != keys2(i)) count++;
874 template <
typename ViewType1,
typename ViewType2>
875 bool are_same(ViewType1 keys1, ViewType2 keys2) {
876 unsigned int are_same_ = 0;
878 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
879 are_same_struct(keys1, keys2),
881 return are_same_ == 0;
887 template <
typename MatrixType>
892 using tpetra_map_type =
typename impl_type::tpetra_map_type;
893 using local_ordinal_type =
typename impl_type::local_ordinal_type;
894 using global_ordinal_type =
typename impl_type::global_ordinal_type;
895 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
896 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
897 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
898 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
900 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
901 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
903 bool hasBlockCrsMatrix = !A_bcrs.is_null();
906 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
908 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
909 const auto domain_map = g.getDomainMap();
910 const auto column_map = g.getColMap();
912 std::vector<global_ordinal_type> gids;
914 Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
916 bool separate_remotes =
true, found_first =
false, need_owned_permutation =
false;
918 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
920 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
921 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
923 if (are_same(domain_map_global_iD, column_map_global_iD)) {
925 separate_remotes =
true;
926 need_owned_permutation =
false;
928 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
929 std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
932 for (
size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
933 const global_ordinal_type gid = column_map->getGlobalElement(i);
934 if (!domain_map->isNodeGlobalElement(gid)) {
937 }
else if (found_first) {
938 separate_remotes =
false;
941 if (!found_first && !need_owned_permutation &&
942 domain_map->getLocalElement(gid) !=
static_cast<local_ordinal_type
>(i)) {
951 need_owned_permutation =
true;
955 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
958 if (separate_remotes) {
959 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
961 const auto parsimonious_col_map = need_owned_permutation ?
Teuchos::rcp(
new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())) :
Teuchos::rcp(
new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
962 if (parsimonious_col_map->getGlobalNumElements() > 0) {
964 local_ordinal_type_1d_view dm2cm;
965 if (need_owned_permutation) {
966 dm2cm = local_ordinal_type_1d_view(
do_not_initialize_tag(
"dm2cm"), domain_map->getLocalNumElements());
967 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
968 for (
size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
969 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
970 Kokkos::deep_copy(dm2cm, dm2cm_host);
972 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
973 return Teuchos::rcp(
new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
976 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
977 return Teuchos::null;
980 template <
typename local_ordinal_type>
981 local_ordinal_type costTRSM(
const local_ordinal_type block_size) {
982 return block_size * block_size;
985 template <
typename local_ordinal_type>
986 local_ordinal_type costGEMV(
const local_ordinal_type block_size) {
987 return 2 * block_size * block_size;
990 template <
typename local_ordinal_type>
991 local_ordinal_type costTriDiagSolve(
const local_ordinal_type subline_length,
const local_ordinal_type block_size) {
992 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
995 template <
typename local_ordinal_type>
996 local_ordinal_type costSolveSchur(
const local_ordinal_type num_parts,
997 const local_ordinal_type num_teams,
998 const local_ordinal_type line_length,
999 const local_ordinal_type block_size,
1000 const local_ordinal_type n_subparts_per_part) {
1001 const local_ordinal_type subline_length = ceil(
double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
1002 if (subline_length < 1) {
1006 const local_ordinal_type p_n_lines = ceil(
double(num_parts) / num_teams);
1007 const local_ordinal_type p_n_sublines = ceil(
double(n_subparts_per_part) * num_parts / num_teams);
1008 const local_ordinal_type p_n_sublines_2 = ceil(
double(n_subparts_per_part - 1) * num_parts / num_teams);
1010 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1011 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1012 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1013 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1015 if (n_subparts_per_part == 1) {
1016 return p_costApplyAinv;
1018 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1021 template <
typename local_ordinal_type>
1022 local_ordinal_type getAutomaticNSubparts(
const local_ordinal_type num_parts,
1023 const local_ordinal_type num_teams,
1024 const local_ordinal_type line_length,
1025 const local_ordinal_type block_size) {
1026 local_ordinal_type n_subparts_per_part_0 = 1;
1027 local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
1028 local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0 + 1);
1029 while (flop_0 > flop_1) {
1031 flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0) + 1);
1033 return n_subparts_per_part_0;
1036 template <
typename ArgActiveExecutionMemorySpace>
1037 struct SolveTridiagsDefaultModeAndAlgo;
1042 template <
typename MatrixType>
1043 BlockHelperDetails::PartInterface<MatrixType>
1045 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1047 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1050 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1051 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1052 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
1053 using size_type =
typename impl_type::size_type;
1055 auto bA = Teuchos::rcp_dynamic_cast<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1058 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1059 constexpr
int vector_length = impl_type::vector_length;
1060 constexpr
int internal_vector_length = impl_type::internal_vector_length;
1062 const auto comm = A->getRowMap()->getComm();
1064 BlockHelperDetails::PartInterface<MatrixType> interf;
1066 const bool jacobi = partitions.size() == 0;
1067 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1068 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1070 typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1071 std::vector<size_idx_pair_type> partsz(nparts);
1074 for (local_ordinal_type i = 0; i < nparts; ++i)
1075 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1076 std::sort(partsz.begin(), partsz.end(),
1077 [](
const size_idx_pair_type &x,
const size_idx_pair_type &y) {
1078 return x.first > y.first;
1082 local_ordinal_type n_subparts_per_part;
1083 if (n_subparts_per_part_in == -1) {
1086 using execution_space =
typename impl_type::execution_space;
1088 const int line_length = partsz[0].first;
1090 const local_ordinal_type team_size =
1091 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1092 recommended_team_size(blocksize, vector_length, internal_vector_length);
1094 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1096 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1098 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1099 printf(
"Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1102 n_subparts_per_part = n_subparts_per_part_in;
1106 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1109 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
1111 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1112 local_ordinal_type nrows = 0;
1116 for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1118 TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) <<
"The #rows implied by the local partition is not "
1119 <<
"the same as getLocalNumRows: " << nrows <<
" vs " << A_n_lclrows);
1123 std::vector<local_ordinal_type> p;
1125 interf.max_partsz = 1;
1126 interf.max_subpartsz = 0;
1127 interf.n_subparts_per_part = 1;
1128 interf.nparts = nparts;
1133 for (local_ordinal_type i = 0; i < nparts; ++i)
1134 p[i] = partsz[i].second;
1136 interf.max_partsz = partsz[0].first;
1138 constexpr local_ordinal_type connection_length = 2;
1139 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1140 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1142 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1143 interf.n_subparts_per_part = n_subparts_per_part;
1144 interf.nparts = nparts;
1150 interf.part2rowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0"), nparts + 1);
1151 interf.part2packrowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2packrowidx0"), nparts + 1);
1154 interf.part2rowidx0_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1155 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1156 interf.rowidx2part_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1158 interf.partptr_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"partptr_sub"), n_sub_parts_and_schur, 2);
1161 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1162 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1164 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1165 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1166 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1167 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1169 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1170 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1171 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1174 interf.row_contiguous =
true;
1176 part2rowidx0(0) = 0;
1177 part2packrowidx0(0) = 0;
1178 local_ordinal_type pack_nrows = 0;
1179 local_ordinal_type pack_nrows_sub = 0;
1181 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices (Jacobi)", Jacobi);
1185 for (local_ordinal_type i = 0; i <= nparts; ++i) {
1186 part2rowidx0(i) = i;
1189 for (local_ordinal_type i = 0; i < nparts; ++i) {
1193 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1195 if (ip % vector_length == 0) pack_nrows = 1;
1196 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1198 part2rowidx0_sub(0) = 0;
1199 partptr_sub(0, 0) = 0;
1201 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1202 constexpr local_ordinal_type ipnrows = 1;
1203 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1206 "In the part " << ip);
1208 constexpr local_ordinal_type connection_length = 2;
1210 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1212 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1214 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1215 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1217 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1219 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1220 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1221 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1222 if (local_sub_ip != n_subparts_per_part - 1) {
1223 if (local_sub_ip != 0) {
1224 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1225 }
else if (ip != 0) {
1226 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1228 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1229 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1230 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1232 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1233 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1235 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1236 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1237 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1240 if (local_sub_ip != 0) {
1241 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1242 }
else if (ip != 0) {
1243 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1245 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1247 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1249 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1250 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1256 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1257 std::cout <<
"partptr_sub = " << std::endl;
1258 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1259 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1260 std::cout << partptr_sub(i, j) <<
" ";
1262 std::cout << std::endl;
1264 std::cout <<
"partptr_sub end" << std::endl;
1268 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1270 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1271 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1272 part2packrowidx0_sub(ip, 0) = 0;
1274 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1276 local_ordinal_type ip_min = ipack * vector_length;
1277 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1278 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1279 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1283 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1284 local_ordinal_type ip_min = ipack * vector_length;
1285 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1287 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1289 constexpr local_ordinal_type connection_length = 2;
1291 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1292 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1294 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1295 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1296 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1298 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1300 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1301 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1306 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1308 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1310 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices", indices);
1311 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1312 const auto *part = &partitions[p[ip]];
1313 const local_ordinal_type ipnrows = part->size();
1314 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1316 BlockHelperDetails::get_msg_prefix(comm)
1317 <<
"partition " << p[ip]
1318 <<
" is empty, which is not allowed.");
1320 part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1323 if (ip % vector_length == 0) pack_nrows = ipnrows;
1324 part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1325 const local_ordinal_type offset = partptr(ip);
1326 for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1327 const auto lcl_row = (*part)[i];
1329 BlockHelperDetails::get_msg_prefix(comm)
1330 <<
"partitions[" << p[ip] <<
"]["
1331 << i <<
"] = " << lcl_row
1332 <<
" but input matrix implies limits of [0, " << A_n_lclrows - 1
1334 lclrow(offset + i) = lcl_row;
1335 rowidx2part(offset + i) = ip;
1336 if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1337 interf.row_contiguous =
false;
1339 partptr(ip + 1) = offset + ipnrows;
1341 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1342 printf(
"Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1343 printf(
"partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1347 part2rowidx0_sub(0) = 0;
1348 partptr_sub(0, 0) = 0;
1351 for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1352 const auto *part = &partitions[p[ip]];
1353 const local_ordinal_type ipnrows = part->size();
1354 const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1357 "In the part " << ip);
1359 constexpr local_ordinal_type connection_length = 2;
1361 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1363 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1365 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1366 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1368 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1370 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1371 const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1372 const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1373 if (local_sub_ip != n_subparts_per_part - 1) {
1374 if (local_sub_ip != 0) {
1375 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1376 }
else if (ip != 0) {
1377 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1379 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1380 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1381 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1383 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1384 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1386 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1387 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1388 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1391 if (local_sub_ip != 0) {
1392 partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1393 }
else if (ip != 0) {
1394 partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1396 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1398 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1400 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1401 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1408 local_ordinal_type npacks = ceil(
float(nparts) / vector_length);
1410 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1411 for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1412 part2packrowidx0_sub(ip, 0) = 0;
1414 for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1416 local_ordinal_type ip_min = ipack * vector_length;
1417 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1418 for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1419 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1423 for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1424 local_ordinal_type ip_min = ipack * vector_length;
1425 ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1427 const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1429 constexpr local_ordinal_type connection_length = 2;
1431 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1432 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1434 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1435 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1436 if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1438 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1440 for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1441 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1446 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1448 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1450 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1453 if (lclrow(0) != 0) interf.row_contiguous =
false;
1455 Kokkos::deep_copy(interf.partptr, partptr);
1456 Kokkos::deep_copy(interf.lclrow, lclrow);
1458 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1461 interf.part2rowidx0 = interf.partptr;
1462 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1464 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1465 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1468 IFPACK2_BLOCKHELPER_TIMER(
"Fill packptr", packptr0);
1469 local_ordinal_type npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1471 for (local_ordinal_type ip = 1; ip <= nparts; ++ip)
1472 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1476 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1478 for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1479 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1482 Kokkos::deep_copy(interf.packptr, packptr);
1484 local_ordinal_type npacks_per_subpart = ceil(
float(nparts) / vector_length);
1485 npacks = ceil(
float(nparts) / vector_length) * (part2packrowidx0_sub.extent(1) - 1);
1487 interf.packindices_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1488 interf.packindices_schur = local_ordinal_type_2d_view(
do_not_initialize_tag(
"packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1490 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1491 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1494 for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1495 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1496 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1497 packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1501 for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1502 packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1505 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1506 std::cout <<
"packindices_sub = " << std::endl;
1507 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1508 std::cout << packindices_sub(i) <<
" ";
1510 std::cout << std::endl;
1511 std::cout <<
"packindices_sub end" << std::endl;
1513 std::cout <<
"packindices_schur = " << std::endl;
1514 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1515 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1516 std::cout << packindices_schur(i, j) <<
" ";
1518 std::cout << std::endl;
1521 std::cout <<
"packindices_schur end" << std::endl;
1524 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1525 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1528 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1530 for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1531 packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1533 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1534 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1536 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1544 template <
typename MatrixType>
1547 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1549 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1550 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1551 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1552 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1558 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1561 local_ordinal_type_1d_view A_colindsub;
1564 vector_type_3d_view values;
1567 vector_type_3d_view values_schur;
1569 vector_type_4d_view e_values;
1574 size_type_1d_view diag_offsets;
1578 btdm_scalar_type_3d_view d_inv;
1580 bool is_diagonal_only;
1586 template <
typename idx_type>
1587 static KOKKOS_FORCEINLINE_FUNCTION
1589 IndexToRow(
const idx_type &ind) {
return (ind + 1) / 3; }
1592 template <
typename idx_type>
1593 static KOKKOS_FORCEINLINE_FUNCTION
1595 RowToIndex(
const idx_type &row) {
return row > 0 ? 3 * row - 1 : 0; }
1597 template <
typename idx_type>
1598 static KOKKOS_FORCEINLINE_FUNCTION
1600 NumBlocks(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows - 2 : 0; }
1602 template <
typename idx_type>
1603 static KOKKOS_FORCEINLINE_FUNCTION
1605 NumBlocksSchur(
const idx_type &nrows) {
return nrows > 0 ? 3 * nrows + 2 : 0; }
1611 template <
typename MatrixType>
1614 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags", createBlockTridiags0);
1616 using execution_space =
typename impl_type::execution_space;
1617 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1618 using size_type =
typename impl_type::size_type;
1619 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1621 constexpr
int vector_length = impl_type::vector_length;
1625 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1628 btdm.flat_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1629 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
1630 Kokkos::parallel_scan(
1631 "createBlockTridiags::RangePolicy::flat_td_ptr",
1632 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1633 const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1634 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1637 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1639 if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1640 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1641 if (local_subpartidx % 2 == 0)
1642 update += btdm.NumBlocks(nrows);
1644 update += btdm.NumBlocksSchur(nrows);
1648 const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1649 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
1653 if (vector_length == 1) {
1654 btdm.pack_td_ptr = btdm.flat_td_ptr;
1658 local_ordinal_type npacks_per_subpart = 0;
1659 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1660 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1661 for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip)
1662 if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1663 ++npacks_per_subpart;
1665 btdm.pack_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1666 const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1668 Kokkos::parallel_for(
1669 "createBlockTridiags::RangePolicy::pack_td_ptr",
1670 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1671 for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1672 const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1673 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1675 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1676 const local_ordinal_type partidx = parti % interf.nparts;
1678 for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1679 btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1685 btdm.pack_td_ptr_schur = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1687 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1688 constexpr local_ordinal_type connection_length = 2;
1690 host_pack_td_ptr_schur(0, 0) = 0;
1691 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1692 if (i % vector_length == 0) {
1694 host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1695 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1696 host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1699 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1700 host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1705 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1707 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1708 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1709 std::cout <<
"flat_td_ptr = " << std::endl;
1710 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1711 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1712 std::cout << host_flat_td_ptr(i, j) <<
" ";
1714 std::cout << std::endl;
1716 std::cout <<
"flat_td_ptr end" << std::endl;
1718 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1720 std::cout <<
"pack_td_ptr = " << std::endl;
1721 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1722 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1723 std::cout << host_pack_td_ptr(i, j) <<
" ";
1725 std::cout << std::endl;
1727 std::cout <<
"pack_td_ptr end" << std::endl;
1729 std::cout <<
"pack_td_ptr_schur = " << std::endl;
1730 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1731 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1732 std::cout << host_pack_td_ptr_schur(i, j) <<
" ";
1734 std::cout << std::endl;
1736 std::cout <<
"pack_td_ptr_schur end" << std::endl;
1740 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1755 template <
typename MatrixType>
1756 void setTridiagsToIdentity(
const BlockTridiags<MatrixType> &btdm,
1757 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1759 using execution_space =
typename impl_type::execution_space;
1760 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1761 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1763 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1764 const local_ordinal_type blocksize = btdm.values.extent(1);
1767 const int vector_length = impl_type::vector_length;
1768 const int internal_vector_length = impl_type::internal_vector_length;
1770 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
1771 using internal_vector_type =
typename impl_type::internal_vector_type;
1772 using internal_vector_type_4d_view =
1773 typename impl_type::internal_vector_type_4d_view;
1775 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1776 const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1777 btdm.values.extent(0),
1778 btdm.values.extent(1),
1779 btdm.values.extent(2),
1780 vector_length / internal_vector_length);
1781 const local_ordinal_type vector_loop_size = values.extent(3);
1782 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1783 local_ordinal_type total_team_size(0);
1785 total_team_size = 32;
1786 else if (blocksize <= 9)
1787 total_team_size = 64;
1788 else if (blocksize <= 12)
1789 total_team_size = 96;
1790 else if (blocksize <= 16)
1791 total_team_size = 128;
1792 else if (blocksize <= 20)
1793 total_team_size = 160;
1795 total_team_size = 160;
1796 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1797 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1798 #elif defined(KOKKOS_ENABLE_HIP)
1803 local_ordinal_type total_team_size(0);
1805 total_team_size = 32;
1806 else if (blocksize <= 9)
1807 total_team_size = 64;
1808 else if (blocksize <= 12)
1809 total_team_size = 96;
1810 else if (blocksize <= 16)
1811 total_team_size = 128;
1812 else if (blocksize <= 20)
1813 total_team_size = 160;
1815 total_team_size = 160;
1816 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1817 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1818 #elif defined(KOKKOS_ENABLE_SYCL)
1820 local_ordinal_type total_team_size(0);
1822 total_team_size = 32;
1823 else if (blocksize <= 9)
1824 total_team_size = 64;
1825 else if (blocksize <= 12)
1826 total_team_size = 96;
1827 else if (blocksize <= 16)
1828 total_team_size = 128;
1829 else if (blocksize <= 20)
1830 total_team_size = 160;
1832 total_team_size = 160;
1833 const local_ordinal_type team_size = total_team_size / vector_loop_size;
1834 const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1837 const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1839 Kokkos::parallel_for(
1840 "setTridiagsToIdentity::TeamPolicy",
1841 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
1842 const local_ordinal_type k = member.league_rank();
1843 const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1844 const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1846 const local_ordinal_type diff = iend - ibeg;
1847 const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
1848 const btdm_scalar_type one(1);
1849 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
1850 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](
const local_ordinal_type &ii) {
1851 const local_ordinal_type i = ibeg + ii * 3;
1852 for (local_ordinal_type j = 0; j < blocksize; ++j) {
1853 values(i, j, j, v) = one;
1864 template <
typename MatrixType>
1866 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1867 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1870 const bool overlap_communication_and_computation,
1871 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1873 bool use_fused_jacobi) {
1874 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SymbolicPhase", SymbolicPhase);
1878 using execution_space =
typename impl_type::execution_space;
1879 using host_execution_space =
typename impl_type::host_execution_space;
1881 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1882 using global_ordinal_type =
typename impl_type::global_ordinal_type;
1883 using size_type =
typename impl_type::size_type;
1884 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1885 using size_type_1d_view =
typename impl_type::size_type_1d_view;
1886 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1887 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1888 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
1889 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
1890 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
1892 constexpr
int vector_length = impl_type::vector_length;
1894 const auto comm = A->getRowMap()->getComm();
1896 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
1897 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
1899 bool hasBlockCrsMatrix = !A_bcrs.is_null();
1901 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1904 const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1905 const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1906 const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1907 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1908 const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1910 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1912 Kokkos::View<local_ordinal_type *, host_execution_space> col2row(
"col2row", A->getLocalNumCols());
1918 const auto rowmap = g->getRowMap();
1919 const auto colmap = g->getColMap();
1920 const auto dommap = g->getDomainMap();
1921 TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1923 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1924 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1925 Kokkos::parallel_for(
1926 "performSymbolicPhase::RangePolicy::col2row",
1927 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
1928 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1930 if (dommap->isNodeGlobalElement(gid)) {
1931 const local_ordinal_type lc = colmap->getLocalElement(gid);
1932 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1934 BlockHelperDetails::get_msg_prefix(comm) <<
"GID " << gid
1935 <<
" gives an invalid local column.");
1945 const auto local_graph = g->getLocalGraphHost();
1946 const auto local_graph_rowptr = local_graph.row_map;
1947 TEUCHOS_ASSERT(local_graph_rowptr.size() ==
static_cast<size_t>(nrows + 1));
1948 const auto local_graph_colidx = local_graph.entries;
1952 Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx(
"lclrow2idx", nrows);
1954 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1955 Kokkos::parallel_for(
1956 "performSymbolicPhase::RangePolicy::lclrow2idx",
1957 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1958 lclrow2idx[lclrow(i)] = i;
1964 typename sum_reducer_type::value_type sum_reducer_value;
1966 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1967 Kokkos::parallel_reduce
1970 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
typename sum_reducer_type::value_type &update) {
1972 const local_ordinal_type ri0 = lclrow2idx[lr];
1973 const local_ordinal_type pi0 = rowidx2part(ri0);
1974 for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
1975 const local_ordinal_type lc = local_graph_colidx(j);
1976 const local_ordinal_type lc2r = col2row[lc];
1977 bool incr_R =
false;
1979 if (lc2r == (local_ordinal_type)-1) {
1983 const local_ordinal_type ri = lclrow2idx[lc2r];
1984 const local_ordinal_type pi = rowidx2part(ri);
1992 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2005 sum_reducer_type(sum_reducer_value));
2007 size_type D_nnz = sum_reducer_value.v[0];
2008 size_type R_nnz_owned = sum_reducer_value.v[1];
2009 size_type R_nnz_remote = sum_reducer_value.v[2];
2011 if (!overlap_communication_and_computation) {
2012 R_nnz_owned += R_nnz_remote;
2018 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2020 btdm.A_colindsub = local_ordinal_type_1d_view(
"btdm.A_colindsub", D_nnz);
2021 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2023 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2027 const local_ordinal_type nparts = partptr.extent(0) - 1;
2030 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2031 Kokkos::parallel_for(
2032 "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2033 policy, KOKKOS_LAMBDA(
const local_ordinal_type &pi0) {
2034 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2035 local_ordinal_type offset = 0;
2036 for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2037 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2039 const local_ordinal_type lr0 = lclrow(ri0);
2040 const size_type j0 = local_graph_rowptr(lr0);
2041 for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2042 const local_ordinal_type lc = local_graph_colidx(j);
2043 const local_ordinal_type lc2r = col2row[lc];
2044 if (lc2r == (local_ordinal_type)-1)
continue;
2045 const local_ordinal_type ri = lclrow2idx[lc2r];
2046 const local_ordinal_type pi = rowidx2part(ri);
2047 if (pi != pi0)
continue;
2048 if (ri + 1 < ri0 || ri > ri0 + 1)
continue;
2049 const local_ordinal_type row_entry = j - j0;
2050 D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2055 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2056 for (
size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2059 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2063 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2064 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2065 btdm.values = vector_type_3d_view(
"btdm.values", num_packed_blocks(), blocksize, blocksize);
2067 if (interf.n_subparts_per_part > 1) {
2068 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2069 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2070 btdm.values_schur = vector_type_3d_view(
"btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2073 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2079 amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
2080 amd.A_colindsub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
2082 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2083 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2085 amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2086 amd.A_colindsub_remote = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
2088 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2089 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2092 const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2093 Kokkos::parallel_for(
2094 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2095 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2096 const local_ordinal_type ri0 = lclrow2idx[lr];
2097 const local_ordinal_type pi0 = rowidx2part(ri0);
2098 const size_type j0 = local_graph_rowptr(lr);
2099 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2100 const local_ordinal_type lc = local_graph_colidx(j);
2101 const local_ordinal_type lc2r = col2row[lc];
2102 if (lc2r != (local_ordinal_type)-1) {
2103 const local_ordinal_type ri = lclrow2idx[lc2r];
2104 const local_ordinal_type pi = rowidx2part(ri);
2105 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2110 if (!overlap_communication_and_computation || lc < nrows) {
2113 ++R_rowptr_remote(lr);
2122 Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2123 Kokkos::parallel_scan(
2124 "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2125 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr, update_type &update,
const bool &
final) {
2127 val.v[0] = R_rowptr(lr);
2128 if (overlap_communication_and_computation)
2129 val.v[1] = R_rowptr_remote(lr);
2132 R_rowptr(lr) = update.v[0];
2133 if (overlap_communication_and_computation)
2134 R_rowptr_remote(lr) = update.v[1];
2137 const local_ordinal_type ri0 = lclrow2idx[lr];
2138 const local_ordinal_type pi0 = rowidx2part(ri0);
2140 size_type cnt_rowptr = R_rowptr(lr);
2141 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;
2143 const size_type j0 = local_graph_rowptr(lr);
2144 for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2145 const local_ordinal_type lc = local_graph_colidx(j);
2146 const local_ordinal_type lc2r = col2row[lc];
2147 if (lc2r != (local_ordinal_type)-1) {
2148 const local_ordinal_type ri = lclrow2idx[lc2r];
2149 const local_ordinal_type pi = rowidx2part(ri);
2150 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2153 const local_ordinal_type row_entry = j - j0;
2154 if (!overlap_communication_and_computation || lc < nrows)
2155 R_A_colindsub(cnt_rowptr++) = row_entry;
2157 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2165 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2166 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2167 if (overlap_communication_and_computation) {
2169 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2170 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2174 if (hasBlockCrsMatrix)
2175 amd.tpetra_values = (
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst());
2177 amd.tpetra_values = (
const_cast<crs_matrix_type *
>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2183 if (interf.n_subparts_per_part > 1)
2184 btdm.e_values = vector_type_4d_view(
"btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2194 if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2195 bool is_async_importer_active = !async_importer.is_null();
2196 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2197 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2198 BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2202 if (use_fused_jacobi) {
2203 btdm.d_inv = btdm_scalar_type_3d_view(
do_not_initialize_tag(
"btdm.d_inv"), interf.nparts, blocksize, blocksize);
2204 auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2205 auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2206 btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2208 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2214 template <
typename ArgActiveExecutionMemorySpace>
2219 typedef KB::Mode::Serial mode_type;
2220 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2221 typedef KB::Algo::Level3::CompactMKL algo_type;
2223 typedef KB::Algo::Level3::Blocked algo_type;
2225 static int recommended_team_size(
const int ,
2232 #if defined(KOKKOS_ENABLE_CUDA)
2233 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(
const int blksize,
2234 const int vector_length,
2235 const int internal_vector_length) {
2236 const int vector_size = vector_length / internal_vector_length;
2237 int total_team_size(0);
2239 total_team_size = 32;
2240 else if (blksize <= 9)
2241 total_team_size = 32;
2242 else if (blksize <= 12)
2243 total_team_size = 96;
2244 else if (blksize <= 16)
2245 total_team_size = 128;
2246 else if (blksize <= 20)
2247 total_team_size = 160;
2249 total_team_size = 160;
2250 return 2 * total_team_size / vector_size;
2253 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2254 typedef KB::Mode::Team mode_type;
2255 typedef KB::Algo::Level3::Unblocked algo_type;
2256 static int recommended_team_size(
const int blksize,
2257 const int vector_length,
2258 const int internal_vector_length) {
2259 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2263 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2264 typedef KB::Mode::Team mode_type;
2265 typedef KB::Algo::Level3::Unblocked algo_type;
2266 static int recommended_team_size(
const int blksize,
2267 const int vector_length,
2268 const int internal_vector_length) {
2269 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2274 #if defined(KOKKOS_ENABLE_HIP)
2275 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(
const int blksize,
2276 const int vector_length,
2277 const int internal_vector_length) {
2278 const int vector_size = vector_length / internal_vector_length;
2279 int total_team_size(0);
2281 total_team_size = 32;
2282 else if (blksize <= 9)
2283 total_team_size = 32;
2284 else if (blksize <= 12)
2285 total_team_size = 96;
2286 else if (blksize <= 16)
2287 total_team_size = 128;
2288 else if (blksize <= 20)
2289 total_team_size = 160;
2291 total_team_size = 160;
2292 return 2 * total_team_size / vector_size;
2295 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2296 typedef KB::Mode::Team mode_type;
2297 typedef KB::Algo::Level3::Unblocked algo_type;
2298 static int recommended_team_size(
const int blksize,
2299 const int vector_length,
2300 const int internal_vector_length) {
2301 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2305 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2306 typedef KB::Mode::Team mode_type;
2307 typedef KB::Algo::Level3::Unblocked algo_type;
2308 static int recommended_team_size(
const int blksize,
2309 const int vector_length,
2310 const int internal_vector_length) {
2311 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2316 #if defined(KOKKOS_ENABLE_SYCL)
2317 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(
const int blksize,
2318 const int vector_length,
2319 const int internal_vector_length) {
2320 const int vector_size = vector_length / internal_vector_length;
2321 int total_team_size(0);
2323 total_team_size = 32;
2324 else if (blksize <= 9)
2325 total_team_size = 32;
2326 else if (blksize <= 12)
2327 total_team_size = 96;
2328 else if (blksize <= 16)
2329 total_team_size = 128;
2330 else if (blksize <= 20)
2331 total_team_size = 160;
2333 total_team_size = 160;
2334 return 2 * total_team_size / vector_size;
2337 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2338 typedef KB::Mode::Team mode_type;
2339 typedef KB::Algo::Level3::Unblocked algo_type;
2340 static int recommended_team_size(
const int blksize,
2341 const int vector_length,
2342 const int internal_vector_length) {
2343 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2347 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2348 typedef KB::Mode::Team mode_type;
2349 typedef KB::Algo::Level3::Unblocked algo_type;
2350 static int recommended_team_size(
const int blksize,
2351 const int vector_length,
2352 const int internal_vector_length) {
2353 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2358 template <
typename impl_type,
typename WWViewType>
2359 KOKKOS_INLINE_FUNCTION
void
2360 solveMultiVector(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2361 const typename impl_type::local_ordinal_type & ,
2362 const typename impl_type::local_ordinal_type &i0,
2363 const typename impl_type::local_ordinal_type &r0,
2364 const typename impl_type::local_ordinal_type &nrows,
2365 const typename impl_type::local_ordinal_type &v,
2366 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2367 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2368 const WWViewType &WW,
2369 const bool skip_first_pass =
false) {
2370 using execution_space =
typename impl_type::execution_space;
2371 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2372 using member_type =
typename team_policy_type::member_type;
2373 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2375 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2377 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2378 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2380 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2383 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2384 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2387 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2388 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2391 local_ordinal_type i = i0, r = r0;
2395 if (skip_first_pass) {
2396 i += (nrows - 2) * 3;
2398 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2399 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2400 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2401 KB::Trsm<member_type,
2402 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2403 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2404 X1.assign_data(X2.data());
2407 KB::Trsm<member_type,
2408 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2409 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2410 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2411 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2412 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2413 member.team_barrier();
2414 KB::Gemm<member_type,
2415 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2416 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2417 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2418 KB::Trsm<member_type,
2419 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2420 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2421 X1.assign_data(X2.data());
2426 KB::Trsm<member_type,
2427 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2428 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2429 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2431 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2432 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2433 member.team_barrier();
2434 KB::Gemm<member_type,
2435 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2436 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2438 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2439 KB::Trsm<member_type,
2440 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2441 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2442 X1.assign_data(X2.data());
2446 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2447 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2448 member.team_barrier();
2449 KB::Gemm<member_type,
2450 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2451 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2455 template <
typename impl_type,
typename WWViewType,
typename XViewType>
2456 KOKKOS_INLINE_FUNCTION
void
2457 solveSingleVectorNew(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2458 const typename impl_type::local_ordinal_type &blocksize,
2459 const typename impl_type::local_ordinal_type &i0,
2460 const typename impl_type::local_ordinal_type &r0,
2461 const typename impl_type::local_ordinal_type &nrows,
2462 const typename impl_type::local_ordinal_type &v,
2463 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2464 const XViewType &X_internal_vector_values,
2465 const WWViewType &WW) {
2466 using execution_space =
typename impl_type::execution_space;
2469 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2471 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2473 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2474 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2476 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2479 auto A = D_internal_vector_values.data();
2480 auto X = X_internal_vector_values.data();
2483 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2484 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2488 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2489 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
2490 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
2491 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2492 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
2495 A += i0 * astep + v;
2496 X += r0 * xstep + v;
2501 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2504 blocksize, blocksize,
2509 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2510 member.team_barrier();
2511 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2513 blocksize, blocksize,
2515 A + 2 * astep, as0, as1,
2518 X + 1 * xstep, xs0);
2519 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2522 blocksize, blocksize,
2524 A + 3 * astep, as0, as1,
2525 X + 1 * xstep, xs0);
2532 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2535 blocksize, blocksize,
2540 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2542 member.team_barrier();
2543 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2545 blocksize, blocksize,
2547 A + 1 * astep, as0, as1,
2550 X - 1 * xstep, xs0);
2551 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2554 blocksize, blocksize,
2557 X - 1 * xstep, xs0);
2563 const local_ordinal_type ws0 = WW.stride_0();
2564 auto W = WW.data() + v;
2565 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2566 member, blocksize, X, xs0, W, ws0);
2567 member.team_barrier();
2568 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2570 blocksize, blocksize,
2579 template <
typename local_ordinal_type,
typename ViewType>
2580 void writeBTDValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2581 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2582 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2583 std::ofstream myfile;
2584 myfile.open(fileName);
2586 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2587 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2588 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2589 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2591 const local_ordinal_type block_size = scalar_values.extent(1);
2593 const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2594 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2596 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2598 myfile <<
"%%MatrixMarket matrix coordinate real general" << std::endl;
2599 myfile <<
"%%nnz = " << nnz;
2600 myfile <<
" block size = " << block_size;
2601 myfile <<
" number of blocks = " << n_blocks;
2602 myfile <<
" number of parts = " << n_parts;
2603 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2604 myfile <<
" number of rows = " << n_rows;
2605 myfile <<
" number of cols = " << n_rows;
2606 myfile <<
" number of packs = " << n_packs << std::endl;
2608 myfile << n_rows <<
" " << n_rows <<
" " << nnz << std::setprecision(9) << std::endl;
2610 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2611 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2612 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2613 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2614 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2615 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2616 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2618 if (i_block_in_part % 3 == 0) {
2619 current_row_offset = i_block_in_part / 3 * block_size;
2620 current_col_offset = i_block_in_part / 3 * block_size;
2621 }
else if (i_block_in_part % 3 == 1) {
2622 current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2623 current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2624 }
else if (i_block_in_part % 3 == 2) {
2625 current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2626 current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2628 current_row_offset += current_part_idx * n_rows_per_part;
2629 current_col_offset += current_part_idx * n_rows_per_part;
2630 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2631 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2632 current_row = current_row_offset + i_in_block + 1;
2633 current_col = current_col_offset + j_in_block + 1;
2634 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2645 template <
typename local_ordinal_type,
typename ViewType>
2646 void write4DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2647 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2648 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2649 std::ofstream myfile;
2650 myfile.open(fileName);
2652 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2653 const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2654 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2656 const local_ordinal_type block_size = scalar_values.extent(1);
2657 const local_ordinal_type n_cols = scalar_values.extent(2);
2659 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2660 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2662 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2664 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2665 myfile <<
"%%block size = " << block_size;
2666 myfile <<
" number of blocks = " << n_blocks;
2667 myfile <<
" number of parts = " << n_parts;
2668 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2669 myfile <<
" number of rows = " << n_rows;
2670 myfile <<
" number of cols = " << n_cols;
2671 myfile <<
" number of packs = " << n_packs << std::endl;
2673 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2675 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2676 (void)current_row_offset;
2677 (void)current_part_idx;
2678 for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2679 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2680 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2681 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2682 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2683 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2685 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2687 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2688 myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2698 template <
typename local_ordinal_type,
typename ViewType>
2699 void write5DMultiVectorValuesToFile(
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2700 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2701 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2702 std::ofstream myfile;
2703 myfile.open(fileName);
2705 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2706 const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2707 const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2709 const local_ordinal_type block_size = scalar_values.extent(2);
2710 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2711 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2713 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2714 const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2716 const local_ordinal_type n_packs = ceil(
float(n_parts) / n_parts_per_pack);
2718 myfile <<
"%%MatrixMarket matrix array real general" << std::endl;
2719 myfile <<
"%%block size = " << block_size;
2720 myfile <<
" number of blocks = " << n_blocks;
2721 myfile <<
" number of parts = " << n_parts;
2722 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2723 myfile <<
" number of rows = " << n_rows;
2724 myfile <<
" number of cols = " << n_cols;
2725 myfile <<
" number of packs = " << n_packs << std::endl;
2727 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2729 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2730 (void)current_row_offset;
2731 (void)current_part_idx;
2732 for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2733 for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2734 for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2735 for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2736 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2737 for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2738 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2740 if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2742 for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2743 myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2754 template <
typename local_ordinal_type,
typename member_type,
typename ViewType1,
typename ViewType2>
2755 KOKKOS_INLINE_FUNCTION
void
2756 copy3DView(
const member_type &member,
const ViewType1 &view1,
const ViewType2 &view2) {
2769 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2771 template <
typename MatrixType,
int ScratchLevel>
2772 struct ExtractAndFactorizeTridiags {
2774 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2776 using execution_space =
typename impl_type::execution_space;
2777 using memory_space =
typename impl_type::memory_space;
2779 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2782 using magnitude_type =
typename impl_type::magnitude_type;
2784 using row_matrix_type =
typename impl_type::tpetra_row_matrix_type;
2785 using crs_graph_type =
typename impl_type::tpetra_crs_graph_type;
2787 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
2788 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
2790 using size_type_2d_view =
typename impl_type::size_type_2d_view;
2791 using impl_scalar_type_1d_view_tpetra =
typename impl_type::impl_scalar_type_1d_view_tpetra;
2793 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
2794 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2795 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
2796 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
2797 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
2798 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
2799 using btdm_scalar_type_2d_view =
typename impl_type::btdm_scalar_type_2d_view;
2800 using btdm_scalar_type_3d_view =
typename impl_type::btdm_scalar_type_3d_view;
2801 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
2802 using btdm_scalar_type_5d_view =
typename impl_type::btdm_scalar_type_5d_view;
2803 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2804 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2805 using tpetra_block_access_view_type =
typename impl_type::tpetra_block_access_view_type;
2806 using local_crs_graph_type =
typename impl_type::local_crs_graph_type;
2807 using colinds_view =
typename local_crs_graph_type::entries_type;
2809 using internal_vector_type =
typename impl_type::internal_vector_type;
2810 static constexpr
int vector_length = impl_type::vector_length;
2811 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
2812 static_assert(vector_length >= internal_vector_length,
"Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2813 static_assert(vector_length % internal_vector_length == 0,
"Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2818 static constexpr
int half_vector_length = impl_type::half_vector_length;
2821 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2822 using member_type =
typename team_policy_type::member_type;
2826 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2827 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2828 const local_ordinal_type max_partsz;
2830 using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2831 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2832 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2833 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2835 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2836 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2837 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2838 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2839 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2840 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2841 const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2842 const Unmanaged<size_type_1d_view> diag_offsets;
2844 const local_ordinal_type blocksize, blocksize_square;
2846 const magnitude_type tiny;
2847 const local_ordinal_type vector_loop_size;
2849 bool hasBlockCrsMatrix;
2852 ExtractAndFactorizeTridiags(
const BlockTridiags<MatrixType> &btdm_,
2853 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2856 const magnitude_type &tiny_)
2858 partptr(interf_.partptr)
2859 , lclrow(interf_.lclrow)
2860 , packptr(interf_.packptr)
2861 , packindices_sub(interf_.packindices_sub)
2862 , packptr_sub(interf_.packptr_sub)
2863 , partptr_sub(interf_.partptr_sub)
2864 , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2865 , packindices_schur(interf_.packindices_schur)
2866 , max_partsz(interf_.max_partsz)
2869 pack_td_ptr(btdm_.pack_td_ptr)
2870 , flat_td_ptr(btdm_.flat_td_ptr)
2871 , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2872 , A_colindsub(btdm_.A_colindsub)
2873 , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2874 btdm_.values.extent(0),
2875 btdm_.values.extent(1),
2876 btdm_.values.extent(2),
2877 vector_length / internal_vector_length)
2878 , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2879 btdm_.values_schur.extent(0),
2880 btdm_.values_schur.extent(1),
2881 btdm_.values_schur.extent(2),
2882 vector_length / internal_vector_length)
2883 , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2884 btdm_.e_values.extent(0),
2885 btdm_.e_values.extent(1),
2886 btdm_.e_values.extent(2),
2887 btdm_.e_values.extent(3),
2888 vector_length / internal_vector_length)
2889 , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2890 btdm_.values.extent(0),
2891 btdm_.values.extent(1),
2892 btdm_.values.extent(2),
2894 , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2895 btdm_.values_schur.extent(0),
2896 btdm_.values_schur.extent(1),
2897 btdm_.values_schur.extent(2),
2899 , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2900 btdm_.e_values.extent(0),
2901 btdm_.e_values.extent(1),
2902 btdm_.e_values.extent(2),
2903 btdm_.e_values.extent(3),
2905 , d_inv(btdm_.d_inv)
2906 , diag_offsets(btdm_.diag_offsets)
2907 , blocksize(btdm_.values.extent(1))
2908 , blocksize_square(blocksize * blocksize)
2912 , vector_loop_size(vector_length / internal_vector_length) {
2913 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
2914 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
2916 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A_);
2917 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A_);
2919 hasBlockCrsMatrix = !A_bcrs.is_null();
2921 A_block_rowptr = G_->getLocalGraphDevice().row_map;
2922 if (hasBlockCrsMatrix) {
2923 A_values =
const_cast<block_crs_matrix_type *
>(A_bcrs.get())->getValuesDeviceNonConst();
2925 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2926 A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
2931 KOKKOS_INLINE_FUNCTION
2933 extract(local_ordinal_type partidx,
2934 local_ordinal_type local_subpartidx,
2935 local_ordinal_type npacks)
const {
2936 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2937 printf(
"extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2939 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2940 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2941 local_ordinal_type kfs[vector_length] = {};
2942 local_ordinal_type ri0[vector_length] = {};
2943 local_ordinal_type nrows[vector_length] = {};
2945 for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
2946 kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
2947 ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
2948 nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
2949 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2950 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
2951 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
2952 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
2955 local_ordinal_type tr_min = 0;
2956 local_ordinal_type tr_max = nrows[0];
2957 if (local_subpartidx % 2 == 1) {
2961 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2962 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2964 for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
2965 for (local_ordinal_type e = 0; e < 3; ++e) {
2966 if (hasBlockCrsMatrix) {
2967 const impl_scalar_type *block[vector_length] = {};
2968 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2969 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2971 block[vi] = &A_values(Aj * blocksize_square);
2973 const size_type pi = kps + j;
2974 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2975 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2978 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
2979 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
2980 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2981 auto &v = internal_vector_values(pi, ii, jj, 0);
2982 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2983 v[vi] =
static_cast<btdm_scalar_type
>(block[vi][idx]);
2988 const size_type pi = kps + j;
2990 for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2991 const size_type Aj_c = A_colindsub(kfs[vi] + j);
2993 for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
2994 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
2996 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
2997 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
3003 if (nrows[0] == 1)
break;
3004 if (local_subpartidx % 2 == 0) {
3005 if (e == 1 && (tr == 0 || tr + 1 == nrows[0]))
break;
3006 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3007 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3013 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
3014 for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3015 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3025 KOKKOS_INLINE_FUNCTION
3027 extract(
const member_type &member,
3028 const local_ordinal_type &partidxbeg,
3029 local_ordinal_type local_subpartidx,
3030 const local_ordinal_type &npacks,
3031 const local_ordinal_type &vbeg)
const {
3032 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3033 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3035 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3036 local_ordinal_type kfs_vals[internal_vector_length] = {};
3037 local_ordinal_type ri0_vals[internal_vector_length] = {};
3038 local_ordinal_type nrows_vals[internal_vector_length] = {};
3040 const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3041 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3042 kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3043 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3044 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3045 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3046 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3047 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3048 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3052 local_ordinal_type j_vals[internal_vector_length] = {};
3054 local_ordinal_type tr_min = 0;
3055 local_ordinal_type tr_max = nrows_vals[0];
3056 if (local_subpartidx % 2 == 1) {
3060 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3061 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3063 for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3064 for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3065 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3066 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3067 auto &j = j_vals[vi];
3068 const local_ordinal_type kfs = kfs_vals[vi];
3069 const local_ordinal_type ri0 = ri0_vals[vi];
3070 local_ordinal_type lbeg, lend;
3071 if (local_subpartidx % 2 == 0) {
3072 lbeg = (tr == tr_min ? 1 : 0);
3073 lend = (tr == nrows - 1 ? 2 : 3);
3080 }
else if (tr == nrows) {
3085 if (hasBlockCrsMatrix) {
3086 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3087 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3088 const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3089 const size_type pi = kps + j;
3090 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3091 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3093 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3094 [&](
const local_ordinal_type &ii) {
3095 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3096 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3101 for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3102 const size_type Aj_c = A_colindsub(kfs + j);
3103 const size_type pi = kps + j;
3104 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3105 [&](
const local_ordinal_type &ii) {
3106 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3107 for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3108 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3118 template <
typename AAViewType,
3119 typename WWViewType>
3120 KOKKOS_INLINE_FUNCTION
void
3121 factorize_subline(
const member_type &member,
3122 const local_ordinal_type &i0,
3123 const local_ordinal_type &nrows,
3124 const local_ordinal_type &v,
3125 const AAViewType &AA,
3126 const WWViewType &WW)
const {
3127 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3129 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3130 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3133 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3135 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3136 printf(
"i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3140 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3142 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3147 local_ordinal_type i = i0;
3148 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3149 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3150 printf(
"tr = %d, i = %d;\n", tr, i);
3152 B.assign_data(&AA(i + 1, 0, 0, v));
3153 KB::Trsm<member_type,
3154 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3155 default_mode_type, default_algo_type>::invoke(member, one, A, B);
3156 C.assign_data(&AA(i + 2, 0, 0, v));
3157 KB::Trsm<member_type,
3158 KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3159 default_mode_type, default_algo_type>::invoke(member, one, A, C);
3160 A.assign_data(&AA(i + 3, 0, 0, v));
3162 member.team_barrier();
3163 KB::Gemm<member_type,
3164 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3165 default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3167 default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3171 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3172 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3173 KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3174 member.team_barrier();
3175 KB::Trsm<member_type,
3176 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3177 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3178 KB::Trsm<member_type,
3179 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3180 default_mode_type, default_algo_type>::invoke(member, one, W, A);
3185 struct ExtractAndFactorizeSubLineTag {};
3186 struct ExtractAndFactorizeFusedJacobiTag {};
3187 struct ExtractBCDTag {};
3188 struct ComputeETag {};
3189 struct ComputeSchurTag {};
3190 struct FactorizeSchurTag {};
3192 KOKKOS_INLINE_FUNCTION
3194 operator()(
const ExtractAndFactorizeSubLineTag &,
const member_type &member)
const {
3196 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3198 const local_ordinal_type subpartidx = packptr_sub(packidx);
3199 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3200 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3201 const local_ordinal_type partidx = subpartidx % n_parts;
3203 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3204 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3205 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3207 internal_vector_scratch_type_3d_view
3208 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3210 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3211 printf(
"rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3212 printf(
"vector_loop_size = %d\n", vector_loop_size);
3215 if (vector_loop_size == 1) {
3216 extract(partidx, local_subpartidx, npacks);
3217 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3219 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3220 [&](
const local_ordinal_type &v) {
3221 const local_ordinal_type vbeg = v * internal_vector_length;
3222 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3223 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3226 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3229 member.team_barrier();
3230 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3235 KOKKOS_INLINE_FUNCTION
3237 operator()(
const ExtractAndFactorizeFusedJacobiTag &,
const member_type &member)
const {
3238 using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3239 using default_mode_type =
typename default_mode_and_algo_type::mode_type;
3240 using default_algo_type =
typename default_mode_and_algo_type::algo_type;
3243 btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3244 btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3245 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3246 const local_ordinal_type nrows = lclrow.extent(0);
3247 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3248 [&](
const local_ordinal_type &v) {
3249 local_ordinal_type row = member.league_rank() * half_vector_length + v;
3251 auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3252 auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3255 const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3258 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3260 W1.data()[i] = A_diag[i];
3263 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3268 KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3270 member.team_barrier();
3272 KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3273 member.team_barrier();
3274 KB::Trsm<member_type,
3275 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3276 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3277 KB::Trsm<member_type,
3278 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3279 default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3280 member.team_barrier();
3282 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3284 auto d_inv_block = &d_inv(row, 0, 0);
3285 d_inv_block[i] = W2.data()[i];
3291 KOKKOS_INLINE_FUNCTION
3293 operator()(
const ExtractBCDTag &,
const member_type &member)
const {
3295 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3296 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3297 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3299 const local_ordinal_type subpartidx = packptr_sub(packidx);
3300 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3301 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3302 const local_ordinal_type partidx = subpartidx % n_parts;
3304 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3308 if (vector_loop_size == 1) {
3309 extract(partidx, local_subpartidx, npacks);
3311 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3312 [&](
const local_ordinal_type &v) {
3313 const local_ordinal_type vbeg = v * internal_vector_length;
3314 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3315 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3316 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3319 extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3323 member.team_barrier();
3325 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3326 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3328 const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3329 const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3331 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3332 printf(
"Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3336 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3337 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3339 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3340 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3343 KOKKOS_INLINE_FUNCTION
3345 operator()(
const ComputeETag &,
const member_type &member)
const {
3347 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3349 const local_ordinal_type subpartidx = packptr_sub(packidx);
3350 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3351 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3352 const local_ordinal_type partidx = subpartidx % n_parts;
3354 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3355 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3356 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3357 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3358 const local_ordinal_type num_vectors = blocksize;
3362 internal_vector_scratch_type_3d_view
3363 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3364 if (local_subpartidx == 0) {
3365 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3366 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3368 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3369 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3370 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3373 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3374 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3375 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3380 KOKKOS_INLINE_FUNCTION
3382 operator()(
const ComputeSchurTag &,
const member_type &member)
const {
3384 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3385 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3386 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3388 const local_ordinal_type subpartidx = packptr_sub(packidx);
3389 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3390 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3391 const local_ordinal_type partidx = subpartidx % n_parts;
3394 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3400 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3401 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3402 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
3404 for (local_ordinal_type i = 0; i < 4; ++i) {
3405 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3406 Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3409 member.team_barrier();
3411 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3413 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3414 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3416 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3417 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3419 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3421 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3422 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3424 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
3425 for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3426 local_ordinal_type e_r, e_c, c_kps;
3428 if (local_subpartidx_schur == 0) {
3433 }
else if (i == 3) {
3437 }
else if (i == 4) {
3449 }
else if (i == 1) {
3453 }
else if (i == 4) {
3457 }
else if (i == 5) {
3466 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3467 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3468 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3469 KB::Gemm<member_type,
3470 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3471 default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3476 KOKKOS_INLINE_FUNCTION
3478 operator()(
const FactorizeSchurTag &,
const member_type &member)
const {
3479 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3481 const local_ordinal_type subpartidx = packptr_sub(packidx);
3483 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3484 const local_ordinal_type partidx = subpartidx % n_parts;
3486 const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3487 const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3489 internal_vector_scratch_type_3d_view
3490 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3492 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3493 printf(
"FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3496 if (vector_loop_size == 1) {
3497 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3499 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3500 [&](
const local_ordinal_type &v) {
3501 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3507 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3508 const local_ordinal_type team_size =
3509 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3510 recommended_team_size(blocksize, vector_length, internal_vector_length);
3511 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3512 shmem_size(blocksize, blocksize, vector_loop_size);
3515 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3516 printf(
"Start ExtractAndFactorizeSubLineTag\n");
3518 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3519 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3520 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3522 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3523 writeBTDValuesToFile(n_parts, scalar_values,
"before.mm");
3525 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3526 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3528 execution_space().fence();
3530 writeBTDValuesToFile(n_parts, scalar_values,
"after.mm");
3531 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3532 printf(
"End ExtractAndFactorizeSubLineTag\n");
3536 if (packindices_schur.extent(1) > 0) {
3538 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3539 printf(
"Start ExtractBCDTag\n");
3541 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3542 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3544 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_before_extract.mm");
3547 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3548 Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3549 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3551 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3552 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3554 execution_space().fence();
3557 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3558 printf(
"End ExtractBCDTag\n");
3560 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values,
"after_extraction_of_BCD.mm");
3561 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3562 printf(
"Start ComputeETag\n");
3564 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_extract.mm");
3566 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3567 Kokkos::TeamPolicy<execution_space, ComputeETag>
3568 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3570 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3571 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3573 execution_space().fence();
3575 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_compute.mm");
3577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3578 printf(
"End ComputeETag\n");
3583 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3584 printf(
"Start ComputeSchurTag\n");
3586 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3587 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"before_schur.mm");
3588 Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3589 policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3591 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3593 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_schur.mm");
3594 execution_space().fence();
3595 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3596 printf(
"End ComputeSchurTag\n");
3601 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3602 printf(
"Start FactorizeSchurTag\n");
3604 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3605 Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3606 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3607 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3608 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3610 execution_space().fence();
3611 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_factor_schur.mm");
3612 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3613 printf(
"End FactorizeSchurTag\n");
3618 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3621 void run_fused_jacobi() {
3622 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3623 const local_ordinal_type team_size =
3624 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3625 recommended_team_size(blocksize, half_vector_length, 1);
3626 const local_ordinal_type per_team_scratch =
3627 btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3629 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3630 Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3631 policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3633 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3634 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3637 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3644 template <
typename MatrixType>
3646 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3647 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3649 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3650 bool use_fused_jacobi) {
3652 using execution_space =
typename impl_type::execution_space;
3653 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3654 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3655 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3657 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase", NumericPhase);
3659 int blocksize = btdm.values.extent(1);
3662 int scratch_required;
3663 if (!use_fused_jacobi) {
3665 scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3668 scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3671 int max_scratch = team_policy_type::scratch_size_max(0);
3673 if (scratch_required < max_scratch) {
3675 ExtractAndFactorizeTridiags<MatrixType, 0>
function(btdm, interf, A, G, tiny);
3676 if (!use_fused_jacobi)
3679 function.run_fused_jacobi();
3682 ExtractAndFactorizeTridiags<MatrixType, 1>
function(btdm, interf, A, G, tiny);
3683 if (!use_fused_jacobi)
3686 function.run_fused_jacobi();
3688 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3694 template <
typename MatrixType>
3698 using execution_space =
typename impl_type::execution_space;
3699 using memory_space =
typename impl_type::memory_space;
3701 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3703 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3704 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
3705 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3706 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3707 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3708 using const_impl_scalar_type_2d_view_tpetra =
typename impl_scalar_type_2d_view_tpetra::const_type;
3709 static constexpr
int vector_length = impl_type::vector_length;
3711 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
3715 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3716 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3717 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3718 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3719 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3720 const local_ordinal_type blocksize;
3721 const local_ordinal_type num_vectors;
3724 vector_type_3d_view packed_multivector;
3725 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3727 template <
typename TagType>
3728 KOKKOS_INLINE_FUNCTION
void copy_multivectors(
const local_ordinal_type &j,
3729 const local_ordinal_type &vi,
3730 const local_ordinal_type &pri,
3731 const local_ordinal_type &ri0)
const {
3732 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3733 for (local_ordinal_type i = 0; i < blocksize; ++i)
3734 packed_multivector(pri, i, col)[vi] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3739 const vector_type_3d_view &pmv)
3740 : partptr(interf.partptr)
3741 , packptr(interf.packptr)
3742 , part2packrowidx0(interf.part2packrowidx0)
3743 , part2rowidx0(interf.part2rowidx0)
3744 , lclrow(interf.lclrow)
3745 , blocksize(pmv.extent(1))
3746 , num_vectors(pmv.extent(2))
3747 , packed_multivector(pmv) {}
3750 KOKKOS_INLINE_FUNCTION
3752 operator()(
const local_ordinal_type &packidx)
const {
3753 local_ordinal_type partidx = packptr(packidx);
3754 local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3755 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3757 local_ordinal_type ri0[vector_length] = {};
3758 local_ordinal_type nrows[vector_length] = {};
3759 for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3760 ri0[v] = part2rowidx0(partidx);
3761 nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3763 for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3764 local_ordinal_type cnt = 1;
3765 for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3768 const local_ordinal_type pri = pri0 + j;
3769 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3770 for (local_ordinal_type i = 0; i < blocksize; ++i)
3771 for (local_ordinal_type v = 0; v < npacks; ++v)
3772 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3776 KOKKOS_INLINE_FUNCTION
3778 operator()(
const member_type &member)
const {
3779 const local_ordinal_type packidx = member.league_rank();
3780 const local_ordinal_type partidx_begin = packptr(packidx);
3781 const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3782 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3783 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](
const local_ordinal_type &v) {
3784 const local_ordinal_type partidx = partidx_begin + v;
3785 const local_ordinal_type ri0 = part2rowidx0(partidx);
3786 const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3789 const local_ordinal_type pri = pri0;
3790 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3791 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](
const local_ordinal_type &i) {
3792 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3796 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](
const local_ordinal_type &j) {
3797 const local_ordinal_type pri = pri0 + j;
3798 for (local_ordinal_type col = 0; col < num_vectors; ++col)
3799 for (local_ordinal_type i = 0; i < blocksize; ++i)
3800 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3806 void run(
const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3807 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3808 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3810 scalar_multivector = scalar_multivector_;
3811 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3812 const local_ordinal_type vl = vector_length;
3813 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3814 Kokkos::parallel_for(
"MultiVectorConverter::TeamPolicy", policy, *
this);
3816 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3817 Kokkos::parallel_for(
"MultiVectorConverter::RangePolicy", policy, *
this);
3819 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3820 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3829 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3830 typedef KB::Mode::Serial mode_type;
3831 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3832 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3833 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3835 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3837 static int recommended_team_size(
const int ,
3844 #if defined(KOKKOS_ENABLE_CUDA)
3845 static inline int SolveTridiagsRecommendedCudaTeamSize(
const int blksize,
3846 const int vector_length,
3847 const int internal_vector_length) {
3848 const int vector_size = vector_length / internal_vector_length;
3849 int total_team_size(0);
3851 total_team_size = 32;
3852 else if (blksize <= 9)
3853 total_team_size = 32;
3854 else if (blksize <= 12)
3855 total_team_size = 96;
3856 else if (blksize <= 16)
3857 total_team_size = 128;
3858 else if (blksize <= 20)
3859 total_team_size = 160;
3861 total_team_size = 160;
3862 return total_team_size / vector_size;
3866 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3867 typedef KB::Mode::Team mode_type;
3868 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3869 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3870 static int recommended_team_size(
const int blksize,
3871 const int vector_length,
3872 const int internal_vector_length) {
3873 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3877 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3878 typedef KB::Mode::Team mode_type;
3879 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3880 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3881 static int recommended_team_size(
const int blksize,
3882 const int vector_length,
3883 const int internal_vector_length) {
3884 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3889 #if defined(KOKKOS_ENABLE_HIP)
3890 static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3891 const int vector_length,
3892 const int internal_vector_length) {
3893 const int vector_size = vector_length / internal_vector_length;
3894 int total_team_size(0);
3896 total_team_size = 32;
3897 else if (blksize <= 9)
3898 total_team_size = 32;
3899 else if (blksize <= 12)
3900 total_team_size = 96;
3901 else if (blksize <= 16)
3902 total_team_size = 128;
3903 else if (blksize <= 20)
3904 total_team_size = 160;
3906 total_team_size = 160;
3907 return total_team_size / vector_size;
3911 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3912 typedef KB::Mode::Team mode_type;
3913 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3914 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3915 static int recommended_team_size(
const int blksize,
3916 const int vector_length,
3917 const int internal_vector_length) {
3918 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3922 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3923 typedef KB::Mode::Team mode_type;
3924 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3925 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3926 static int recommended_team_size(
const int blksize,
3927 const int vector_length,
3928 const int internal_vector_length) {
3929 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3934 #if defined(KOKKOS_ENABLE_SYCL)
3935 static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
3936 const int vector_length,
3937 const int internal_vector_length) {
3938 const int vector_size = vector_length / internal_vector_length;
3939 int total_team_size(0);
3941 total_team_size = 32;
3942 else if (blksize <= 9)
3943 total_team_size = 32;
3944 else if (blksize <= 12)
3945 total_team_size = 96;
3946 else if (blksize <= 16)
3947 total_team_size = 128;
3948 else if (blksize <= 20)
3949 total_team_size = 160;
3951 total_team_size = 160;
3952 return total_team_size / vector_size;
3956 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3957 typedef KB::Mode::Team mode_type;
3958 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3959 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3960 static int recommended_team_size(
const int blksize,
3961 const int vector_length,
3962 const int internal_vector_length) {
3963 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3967 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3968 typedef KB::Mode::Team mode_type;
3969 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3970 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3971 static int recommended_team_size(
const int blksize,
3972 const int vector_length,
3973 const int internal_vector_length) {
3974 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3979 template <
typename MatrixType>
3980 struct SolveTridiags {
3982 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3983 using execution_space =
typename impl_type::execution_space;
3985 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3988 using magnitude_type =
typename impl_type::magnitude_type;
3989 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3990 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
3992 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3993 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
3994 using size_type_2d_view =
typename impl_type::size_type_2d_view;
3996 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3997 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
3998 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
3999 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
4001 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4003 using internal_vector_type =
typename impl_type::internal_vector_type;
4004 static constexpr
int vector_length = impl_type::vector_length;
4005 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
4008 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4009 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
4012 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4013 using member_type =
typename team_policy_type::member_type;
4017 local_ordinal_type n_subparts_per_part;
4018 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4019 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4020 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4021 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4022 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4023 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4024 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4025 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4027 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4028 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4031 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4034 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4035 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4036 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4038 internal_vector_type_4d_view X_internal_vector_values_schur;
4040 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4041 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4043 const local_ordinal_type vector_loop_size;
4046 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4047 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4048 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4050 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4052 const impl_scalar_type df;
4053 const bool compute_diff;
4056 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
4057 const BlockTridiags<MatrixType> &btdm,
4058 const vector_type_3d_view &pmv,
4059 const impl_scalar_type damping_factor,
4060 const bool is_norm_manager_active)
4062 n_subparts_per_part(interf.n_subparts_per_part)
4063 , partptr(interf.partptr)
4064 , packptr(interf.packptr)
4065 , packindices_sub(interf.packindices_sub)
4066 , packindices_schur(interf.packindices_schur)
4067 , part2packrowidx0(interf.part2packrowidx0)
4068 , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4069 , lclrow(interf.lclrow)
4070 , packptr_sub(interf.packptr_sub)
4071 , partptr_sub(interf.partptr_sub)
4072 , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4075 pack_td_ptr(btdm.pack_td_ptr)
4076 , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4077 btdm.values.extent(0),
4078 btdm.values.extent(1),
4079 btdm.values.extent(2),
4080 vector_length / internal_vector_length)
4081 , X_internal_vector_values((internal_vector_type *)pmv.data(),
4085 vector_length / internal_vector_length)
4086 , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4092 2 * (n_subparts_per_part - 1) * part2packrowidx0_sub.extent(0),
4095 vector_length / internal_vector_length)
4096 , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4097 btdm.values_schur.extent(0),
4098 btdm.values_schur.extent(1),
4099 btdm.values_schur.extent(2),
4100 vector_length / internal_vector_length)
4101 , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4102 btdm.e_values.extent(0),
4103 btdm.e_values.extent(1),
4104 btdm.e_values.extent(2),
4105 btdm.e_values.extent(3),
4106 vector_length / internal_vector_length)
4107 , vector_loop_size(vector_length / internal_vector_length)
4108 , Y_scalar_multivector()
4110 , df(damping_factor)
4111 , compute_diff(is_norm_manager_active) {}
4115 KOKKOS_INLINE_FUNCTION
4117 copyToFlatMultiVector(
const member_type &member,
4118 const local_ordinal_type partidxbeg,
4119 const local_ordinal_type npacks,
4120 const local_ordinal_type pri0,
4121 const local_ordinal_type v,
4122 const local_ordinal_type blocksize,
4123 const local_ordinal_type num_vectors)
const {
4124 const local_ordinal_type vbeg = v * internal_vector_length;
4125 if (vbeg < npacks) {
4126 local_ordinal_type ri0_vals[internal_vector_length] = {};
4127 local_ordinal_type nrows_vals[internal_vector_length] = {};
4128 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4129 const local_ordinal_type partidx = partidxbeg + vv;
4130 ri0_vals[vi] = partptr(partidx);
4131 nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4134 impl_scalar_type z_partial_sum(0);
4135 if (nrows_vals[0] == 1) {
4136 const local_ordinal_type j = 0, pri = pri0;
4138 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4139 const local_ordinal_type ri0 = ri0_vals[vi];
4140 const local_ordinal_type nrows = nrows_vals[vi];
4142 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4143 [&](
const local_ordinal_type &i) {
4144 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4145 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4146 impl_scalar_type &y = Y_scalar_multivector(row, col);
4147 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4151 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4152 z_partial_sum += yd_abs * yd_abs;
4160 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4161 [&](
const local_ordinal_type &j) {
4162 const local_ordinal_type pri = pri0 + j;
4163 for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4164 const local_ordinal_type ri0 = ri0_vals[vi];
4165 const local_ordinal_type nrows = nrows_vals[vi];
4167 for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4168 for (local_ordinal_type i = 0; i < blocksize; ++i) {
4169 const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4170 impl_scalar_type &y = Y_scalar_multivector(row, col);
4171 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4175 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4176 z_partial_sum += yd_abs * yd_abs;
4185 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4192 template <
typename WWViewType>
4193 KOKKOS_INLINE_FUNCTION
void
4194 solveSingleVector(
const member_type &member,
4195 const local_ordinal_type &blocksize,
4196 const local_ordinal_type &i0,
4197 const local_ordinal_type &r0,
4198 const local_ordinal_type &nrows,
4199 const local_ordinal_type &v,
4200 const WWViewType &WW)
const {
4201 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4203 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4204 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4207 auto A = D_internal_vector_values.data();
4208 auto X = X_internal_vector_values.data();
4211 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4212 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4216 const local_ordinal_type astep = D_internal_vector_values.stride_0();
4217 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
4218 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
4219 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4220 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
4223 A += i0 * astep + v;
4224 X += r0 * xstep + v;
4229 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4232 blocksize, blocksize,
4237 for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4238 member.team_barrier();
4239 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4241 blocksize, blocksize,
4243 A + 2 * astep, as0, as1,
4246 X + 1 * xstep, xs0);
4247 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4250 blocksize, blocksize,
4252 A + 3 * astep, as0, as1,
4253 X + 1 * xstep, xs0);
4260 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4263 blocksize, blocksize,
4268 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4270 member.team_barrier();
4271 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4273 blocksize, blocksize,
4275 A + 1 * astep, as0, as1,
4278 X - 1 * xstep, xs0);
4279 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4282 blocksize, blocksize,
4285 X - 1 * xstep, xs0);
4291 const local_ordinal_type ws0 = WW.stride_0();
4292 auto W = WW.data() + v;
4293 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4294 member, blocksize, X, xs0, W, ws0);
4295 member.team_barrier();
4296 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4298 blocksize, blocksize,
4307 template <
typename WWViewType>
4308 KOKKOS_INLINE_FUNCTION
void
4309 solveMultiVector(
const member_type &member,
4310 const local_ordinal_type & ,
4311 const local_ordinal_type &i0,
4312 const local_ordinal_type &r0,
4313 const local_ordinal_type &nrows,
4314 const local_ordinal_type &v,
4315 const WWViewType &WW)
const {
4316 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4318 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4319 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4322 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4323 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4326 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4327 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4330 local_ordinal_type i = i0, r = r0;
4334 KB::Trsm<member_type,
4335 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4336 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4337 for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4338 A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4339 X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4340 member.team_barrier();
4341 KB::Gemm<member_type,
4342 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4343 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4344 A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4345 KB::Trsm<member_type,
4346 KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4347 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4348 X1.assign_data(X2.data());
4352 KB::Trsm<member_type,
4353 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4354 default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4355 for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4357 A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4358 X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4359 member.team_barrier();
4360 KB::Gemm<member_type,
4361 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4362 default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4364 A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4365 KB::Trsm<member_type,
4366 KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4367 default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4368 X1.assign_data(X2.data());
4372 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4373 KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4374 member.team_barrier();
4375 KB::Gemm<member_type,
4376 KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4377 default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4382 struct SingleVectorTag {};
4384 struct MultiVectorTag {};
4387 struct SingleVectorSubLineTag {};
4389 struct MultiVectorSubLineTag {};
4391 struct SingleVectorApplyCTag {};
4393 struct MultiVectorApplyCTag {};
4395 struct SingleVectorSchurTag {};
4397 struct MultiVectorSchurTag {};
4399 struct SingleVectorApplyETag {};
4401 struct MultiVectorApplyETag {};
4403 struct SingleVectorCopyToFlatTag {};
4405 struct SingleZeroingTag {};
4408 KOKKOS_INLINE_FUNCTION
void
4409 operator()(
const SingleVectorTag<B> &,
const member_type &member)
const {
4410 const local_ordinal_type packidx = member.league_rank();
4411 const local_ordinal_type partidx = packptr(packidx);
4412 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4413 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4414 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4415 const local_ordinal_type r0 = part2packrowidx0(partidx);
4416 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4417 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4418 const local_ordinal_type num_vectors = 1;
4419 internal_vector_scratch_type_3d_view
4420 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4421 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4422 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4424 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4425 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4426 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4431 KOKKOS_INLINE_FUNCTION
void
4432 operator()(
const MultiVectorTag<B> &,
const member_type &member)
const {
4433 const local_ordinal_type packidx = member.league_rank();
4434 const local_ordinal_type partidx = packptr(packidx);
4435 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4436 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4437 const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4438 const local_ordinal_type r0 = part2packrowidx0(partidx);
4439 const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4440 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4441 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4443 internal_vector_scratch_type_3d_view
4444 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4445 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4446 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4448 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4449 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4450 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4455 KOKKOS_INLINE_FUNCTION
void
4456 operator()(
const SingleVectorSubLineTag<B> &,
const member_type &member)
const {
4458 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4460 const local_ordinal_type subpartidx = packptr_sub(packidx);
4461 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4462 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4463 const local_ordinal_type partidx = subpartidx % n_parts;
4465 const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4466 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4467 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4468 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4469 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4475 internal_vector_scratch_type_3d_view
4476 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4478 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4479 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4484 KOKKOS_INLINE_FUNCTION
void
4485 operator()(
const SingleVectorApplyCTag<B> &,
const member_type &member)
const {
4488 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4490 const local_ordinal_type subpartidx = packptr_sub(packidx);
4491 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4492 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4493 const local_ordinal_type partidx = subpartidx % n_parts;
4494 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4497 const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4498 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4499 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4501 internal_vector_scratch_type_3d_view
4502 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4506 const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4507 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4508 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0 + 2 : i0 + 2;
4513 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4515 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4516 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4518 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4520 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4521 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4523 if (local_subpartidx == 0) {
4524 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4525 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4526 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4527 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4529 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4531 blocksize, blocksize,
4533 C.data(), C.stride_0(), C.stride_1(),
4534 v_1.data(), v_1.stride_0(),
4536 v_2.data(), v_2.stride_0());
4538 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4539 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4540 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4541 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4542 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4544 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4546 blocksize, blocksize,
4548 C.data(), C.stride_0(), C.stride_1(),
4549 v_1.data(), v_1.stride_0(),
4551 v_2.data(), v_2.stride_0());
4554 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4556 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4557 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4558 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4560 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4562 blocksize, blocksize,
4564 C.data(), C.stride_0(), C.stride_1(),
4565 v_1.data(), v_1.stride_0(),
4567 v_2.data(), v_2.stride_0());
4570 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4571 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4572 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4574 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4576 blocksize, blocksize,
4578 C.data(), C.stride_0(), C.stride_1(),
4579 v_1.data(), v_1.stride_0(),
4581 v_2.data(), v_2.stride_0());
4588 KOKKOS_INLINE_FUNCTION
void
4589 operator()(
const SingleVectorSchurTag<B> &,
const member_type &member)
const {
4590 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4592 const local_ordinal_type partidx = packptr_sub(packidx);
4594 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4596 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4597 const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4599 const local_ordinal_type r0_schur = nrows * member.league_rank();
4601 internal_vector_scratch_type_3d_view
4602 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4604 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4605 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4606 for (local_ordinal_type i = 0; i < 2; ++i) {
4607 copy3DView<local_ordinal_type>(member,
4608 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4609 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4613 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4614 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4617 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4618 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4619 for (local_ordinal_type i = 0; i < 2; ++i) {
4620 copy3DView<local_ordinal_type>(member,
4621 Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4622 Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4628 KOKKOS_INLINE_FUNCTION
void
4629 operator()(
const SingleVectorApplyETag<B> &,
const member_type &member)
const {
4630 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4632 const local_ordinal_type subpartidx = packptr_sub(packidx);
4633 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4634 const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4635 const local_ordinal_type partidx = subpartidx % n_parts;
4636 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4638 const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4639 const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4641 internal_vector_scratch_type_3d_view
4642 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4646 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4648 typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4650 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4651 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4653 if (local_subpartidx == 0) {
4654 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4655 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4657 for (local_ordinal_type row = 0; row < nrows; ++row) {
4658 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4659 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4661 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4663 blocksize, blocksize,
4665 E.data(), E.stride_0(), E.stride_1(),
4666 v_2.data(), v_2.stride_0(),
4668 v_1.data(), v_1.stride_0());
4671 }
else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4672 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4673 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4675 for (local_ordinal_type row = 0; row < nrows; ++row) {
4676 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4677 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4679 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4681 blocksize, blocksize,
4683 E.data(), E.stride_0(), E.stride_1(),
4684 v_2.data(), v_2.stride_0(),
4686 v_1.data(), v_1.stride_0());
4690 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4692 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4694 for (local_ordinal_type row = 0; row < nrows; ++row) {
4695 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4696 auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4698 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4700 blocksize, blocksize,
4702 E.data(), E.stride_0(), E.stride_1(),
4703 v_2.data(), v_2.stride_0(),
4705 v_1.data(), v_1.stride_0());
4709 auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4711 for (local_ordinal_type row = 0; row < nrows; ++row) {
4712 auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4713 auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4715 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4717 blocksize, blocksize,
4719 E.data(), E.stride_0(), E.stride_1(),
4720 v_2.data(), v_2.stride_0(),
4722 v_1.data(), v_1.stride_0());
4730 KOKKOS_INLINE_FUNCTION
void
4731 operator()(
const SingleVectorCopyToFlatTag<B> &,
const member_type &member)
const {
4732 const local_ordinal_type packidx = member.league_rank();
4733 const local_ordinal_type partidx = packptr(packidx);
4734 const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4735 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4736 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4737 const local_ordinal_type num_vectors = 1;
4739 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](
const int &v) {
4740 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4745 KOKKOS_INLINE_FUNCTION
void
4746 operator()(
const SingleZeroingTag<B> &,
const member_type &member)
const {
4747 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4748 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4752 void run(
const impl_scalar_type_2d_view_tpetra &Y,
4753 const impl_scalar_type_1d_view &Z) {
4754 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4755 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SolveTridiags", SolveTridiags);
4758 this->Y_scalar_multivector = Y;
4759 this->Z_scalar_vector = Z;
4761 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4762 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4764 const local_ordinal_type team_size =
4765 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4766 recommended_team_size(blocksize, vector_length, internal_vector_length);
4767 const int per_team_scratch = internal_vector_scratch_type_3d_view ::shmem_size(blocksize, num_vectors, vector_loop_size);
4769 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4770 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4771 if (num_vectors == 1) { \
4772 const Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4773 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4774 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4775 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4777 const Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4778 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4779 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4780 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4784 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4785 if (num_vectors == 1) { \
4786 if (packindices_schur.extent(1) <= 0) { \
4787 Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4788 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4789 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4790 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4794 Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4795 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4796 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4800 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4801 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4802 Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4803 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4804 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4805 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4807 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4808 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4811 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4812 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4813 Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4814 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4815 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4816 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4818 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4819 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4822 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4823 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4824 Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4825 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4826 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4827 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4829 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4830 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4833 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4834 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4835 Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4836 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4837 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4838 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4840 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4841 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4844 Kokkos::TeamPolicy<execution_space, SingleVectorCopyToFlatTag<B>> \
4845 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4846 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4851 Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4852 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4853 policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4854 Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4859 switch (blocksize) {
4860 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4861 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4862 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4863 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4864 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4865 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4866 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4867 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4868 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4869 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4870 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4871 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4872 default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4874 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4876 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4877 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4884 template <
typename MatrixType>
4886 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4887 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4888 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4889 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
4890 const bool overlap_communication_and_computation,
4892 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
4893 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
4894 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
4895 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
4897 const BlockHelperDetails::PartInterface<MatrixType> &interf,
4900 typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
4905 const int max_num_sweeps,
4906 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4907 const int check_tol_every) {
4908 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4911 using node_memory_space =
typename impl_type::node_memory_space;
4912 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4913 using size_type =
typename impl_type::size_type;
4914 using impl_scalar_type =
typename impl_type::impl_scalar_type;
4915 using magnitude_type =
typename impl_type::magnitude_type;
4916 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4917 using vector_type_1d_view =
typename impl_type::vector_type_1d_view;
4918 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4919 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
4921 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4925 "Neither Tpetra importer nor Async importer is null.");
4928 "Maximum number of sweeps must be >= 1.");
4931 const bool is_seq_method_requested = !tpetra_importer.is_null();
4932 const bool is_async_importer_active = !async_importer.is_null();
4933 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4934 const magnitude_type tolerance = tol * tol;
4935 const local_ordinal_type blocksize = btdm.values.extent(1);
4936 const local_ordinal_type num_vectors = Y.getNumVectors();
4937 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4939 const impl_scalar_type zero(0.0);
4941 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4942 "The seq method for applyInverseJacobi, "
4943 <<
"which in any case is for developer use only, "
4944 <<
"does not support norm-based termination.");
4945 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4946 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4948 std::invalid_argument,
4949 "The seq method for applyInverseJacobi, "
4950 <<
"which in any case is for developer use only, "
4951 <<
"only supports memory spaces accessible from host.");
4954 const size_type work_span_required = num_blockrows * num_vectors * blocksize;
4955 if (work.span() < work_span_required)
4956 work = vector_type_1d_view(
"vector workspace 1d view", work_span_required);
4959 const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
4960 if (local_ordinal_type(W.extent(0)) < W_size)
4961 W = impl_scalar_type_1d_view(
"W", W_size);
4963 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4965 if (is_seq_method_requested) {
4966 if (Z.getNumVectors() != Y.getNumVectors())
4967 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors,
false);
4969 if (is_async_importer_active) {
4971 async_importer->createDataBuffer(num_vectors);
4972 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4978 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4979 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4980 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4981 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4982 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4985 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4986 damping_factor, is_norm_manager_active);
4988 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4990 auto A_crs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_crs_matrix_type>(A);
4991 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_block_crs_matrix_type>(A);
4993 bool hasBlockCrsMatrix = !A_bcrs.is_null();
4996 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
4998 BlockHelperDetails::ComputeResidualVector<MatrixType>
4999 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5000 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5004 if (is_norm_manager_active)
5005 norm_manager.setCheckFrequency(check_tol_every);
5009 for (; sweep < max_num_sweeps; ++sweep) {
5013 multivector_converter.run(XX);
5015 if (is_seq_method_requested) {
5019 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5020 compute_residual_vector.run(YY, XX, ZZ);
5023 multivector_converter.run(YY);
5027 if (overlap_communication_and_computation || !is_async_importer_active) {
5028 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5030 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
true);
5031 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5032 if (is_async_importer_active) async_importer->cancel();
5035 if (is_async_importer_active) {
5036 async_importer->syncRecv();
5038 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
false);
5041 if (is_async_importer_active)
5042 async_importer->syncExchange(YY);
5043 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5045 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5053 solve_tridiags.run(YY, W);
5056 if (is_norm_manager_active) {
5058 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5059 if (sweep + 1 == max_num_sweeps) {
5060 norm_manager.ireduce(sweep,
true);
5061 norm_manager.checkDone(sweep + 1, tolerance,
true);
5063 norm_manager.ireduce(sweep);
5071 if (is_norm_manager_active) norm_manager.finalize();
5078 template <
typename MatrixType,
int B>
5079 int applyFusedBlockJacobi_Impl(
5080 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5081 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5082 const bool overlap_communication_and_computation,
5084 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5085 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5086 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5088 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5089 const BlockTridiags<MatrixType> &btdm,
5091 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
5096 const int max_num_sweeps,
5097 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5098 const int check_tol_every) {
5100 using local_ordinal_type =
typename impl_type::local_ordinal_type;
5101 using size_type =
typename impl_type::size_type;
5102 using magnitude_type =
typename impl_type::magnitude_type;
5103 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
5104 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
5108 "Neither Tpetra importer nor Async importer is null.");
5111 "Maximum number of sweeps must be >= 1.");
5114 const bool is_async_importer_active = !async_importer.is_null();
5115 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5116 const magnitude_type tolerance = tol * tol;
5117 const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5118 const local_ordinal_type num_vectors = Y.getNumVectors();
5119 const local_ordinal_type num_blockrows = interf.nparts;
5121 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5123 if (is_async_importer_active) {
5125 async_importer->createDataBuffer(num_vectors);
5126 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5130 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5131 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5133 const bool two_pass_residual =
5134 overlap_communication_and_computation && is_async_importer_active;
5139 size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5140 "Local LHS vector (YY) has total size " << YY.extent(0) <<
"x" << YY.extent(1) <<
" = " << YY.extent(0) * YY.extent(1) <<
",\n"
5141 <<
"but expected " << num_blockrows <<
"x" << blocksize <<
"x" << num_vectors <<
" = " << size_t(num_blockrows) * blocksize * num_vectors <<
'\n');
5142 size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5143 if (work.extent(0) < work_required) {
5147 Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5150 if (W.extent(0) != size_t(num_blockrows))
5154 BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5155 functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5156 BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5157 functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5158 BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5159 functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5162 if (is_norm_manager_active)
5163 norm_manager.setCheckFrequency(check_tol_every);
5168 Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5173 for (; sweep < max_num_sweeps; ++sweep) {
5176 functor_solve_only.run(XX, y_buffers[1 - current_y]);
5179 if (overlap_communication_and_computation || !is_async_importer_active) {
5180 if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5181 if (two_pass_residual) {
5184 functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5188 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5190 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5191 if (is_async_importer_active) async_importer->cancel();
5194 if (is_async_importer_active) {
5195 async_importer->syncRecv();
5197 functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5200 if (is_async_importer_active)
5201 async_importer->syncExchange(y_buffers[current_y]);
5202 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
5204 functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5209 if (is_norm_manager_active) {
5210 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5211 if (sweep + 1 == max_num_sweeps) {
5212 norm_manager.ireduce(sweep,
true);
5213 norm_manager.checkDone(sweep + 1, tolerance,
true);
5215 norm_manager.ireduce(sweep);
5220 current_y = 1 - current_y;
5222 if (current_y == 1) {
5224 Kokkos::deep_copy(YY, y_doublebuf);
5228 if (is_norm_manager_active) norm_manager.finalize();
5235 template <
typename MatrixType>
5237 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5238 const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5239 const bool overlap_communication_and_computation,
5241 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
5242 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
5243 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
5245 const BlockHelperDetails::PartInterface<MatrixType> &interf,
5248 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
5253 const int max_num_sweeps,
5254 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5255 const int check_tol_every) {
5256 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5257 int blocksize = btdm.d_inv.extent(1);
5259 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5261 sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5262 tpetra_importer, async_importer, overlap_communication_and_computation, \
5263 X, Y, W, interf, btdm, amd, work, \
5264 norm_manager, damping_factor, is_y_zero, \
5265 max_num_sweeps, tol, check_tol_every); \
5268 switch (blocksize) {
5269 case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5270 case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5271 case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5272 case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5273 case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5274 case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5275 case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5276 case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5277 case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5278 default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5280 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5285 template <
typename MatrixType>
5288 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5289 using block_tridiags_type = BlockTridiags<MatrixType>;
5292 using async_import_type = AsyncableImport<MatrixType>;
5299 bool overlap_communication_and_computation;
5302 mutable typename impl_type::tpetra_multivector_type Z;
5303 mutable typename impl_type::impl_scalar_type_1d_view W;
5306 part_interface_type part_interface;
5307 block_tridiags_type block_tridiags;
5311 bool use_fused_jacobi;
5314 mutable typename impl_type::vector_type_1d_view work;
5316 mutable typename impl_type::impl_scalar_type_1d_view work_flat;
5317 mutable norm_manager_type norm_manager;
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:141
int applyFusedBlockJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:5236
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3645
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, bool useSeqMethod, bool use_fused_jacobi)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1865
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:274
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type >> &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1044
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:889
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1613
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:346
Definition: Ifpack2_BlockHelper.hpp:377
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:97
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:283
Definition: Ifpack2_BlockHelper.hpp:211
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2215
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:165
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:236
Definition: Ifpack2_BlockHelper.hpp:270
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType >> &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:4885
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1545
Definition: Ifpack2_BlockComputeResidualVector.hpp:23
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3695