Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12 
13 //#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15 
17 
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
21 
22 #include <Kokkos_ArithTraits.hpp>
23 #include <KokkosBatched_Util.hpp>
24 #include <KokkosBatched_Vector.hpp>
25 #include <KokkosBatched_Copy_Decl.hpp>
26 #include <KokkosBatched_Copy_Impl.hpp>
27 #include <KokkosBatched_AddRadial_Decl.hpp>
28 #include <KokkosBatched_AddRadial_Impl.hpp>
29 #include <KokkosBatched_SetIdentity_Decl.hpp>
30 #include <KokkosBatched_SetIdentity_Impl.hpp>
31 #include <KokkosBatched_Gemm_Decl.hpp>
32 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
33 #include <KokkosBatched_Gemm_Team_Impl.hpp>
34 #include <KokkosBatched_Gemv_Decl.hpp>
35 #include <KokkosBatched_Gemv_Team_Impl.hpp>
36 #include <KokkosBatched_Trsm_Decl.hpp>
37 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
38 #include <KokkosBatched_Trsm_Team_Impl.hpp>
39 #include <KokkosBatched_Trsv_Decl.hpp>
40 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
41 #include <KokkosBatched_Trsv_Team_Impl.hpp>
42 #include <KokkosBatched_LU_Decl.hpp>
43 #include <KokkosBatched_LU_Serial_Impl.hpp>
44 #include <KokkosBatched_LU_Team_Impl.hpp>
45 
46 #include <KokkosBlas1_nrm1.hpp>
47 #include <KokkosBlas1_nrm2.hpp>
48 
49 #include <memory>
50 
51 #include "Ifpack2_BlockHelper.hpp"
52 #include "Ifpack2_BlockComputeResidualVector.hpp"
53 #include "Ifpack2_BlockComputeResidualAndSolve.hpp"
54 
55 //#include <KokkosBlas2_gemv.hpp>
56 
57 // TODO: expose this as a CMake variable (or enable this flag only when it is necessary)
58 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
59 //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
60 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
61 #include "cuda_profiler_api.h"
62 #endif
63 
64 // I am not 100% sure about MPI-3 support on CUDA
65 #if MPI_VERSION >= 3
66 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
67 #endif
68 
69 // ::: Experiments :::
70 // Define either pinned memory or CUDA memory for MPI buffers.
71 // If both macros are disabled, the Tpetra memory space is used (UVM space on CUDA).
72 // If defined, pinned memory is used instead of a device pointer.
73 // Pinned memory is enabled by default.
74 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
75 //#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
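// A sketch of how these macros steer the MPI buffer view type (see the
// impl_scalar_type_1d_view alias in AsyncableImport below; CUDA build assumed):
//   pinned memory enabled -> Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>
//   cuda memory enabled   -> Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>
//   both disabled         -> the default impl_scalar_type_1d_view (UVM space)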
76 
77 // if defined, all views are allocated on CUDA space instead of CUDA UVM space
78 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
79 
80 // if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
81 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
82 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
83 #endif
84 
85 // if defined, multiple execution space instances are used
86 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
87 
88 namespace Ifpack2 {
89 
90 namespace BlockTriDiContainerDetails {
91 
92 namespace KB = KokkosBatched;
93 
97 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
98 
99 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
100 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
101  MemoryTraitsType::is_random_access |
102  flag>;
103 
104 template <typename ViewType>
105 using Unmanaged = Kokkos::View<typename ViewType::data_type,
106  typename ViewType::array_layout,
107  typename ViewType::device_type,
108  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
109 template <typename ViewType>
110 using Atomic = Kokkos::View<typename ViewType::data_type,
111  typename ViewType::array_layout,
112  typename ViewType::device_type,
113  MemoryTraits<typename ViewType::memory_traits, Kokkos::Atomic>>;
114 template <typename ViewType>
115 using Const = Kokkos::View<typename ViewType::const_data_type,
116  typename ViewType::array_layout,
117  typename ViewType::device_type,
118  typename ViewType::memory_traits>;
119 template <typename ViewType>
120 using ConstUnmanaged = Const<Unmanaged<ViewType>>;
121 
122 template <typename ViewType>
123 using AtomicUnmanaged = Atomic<Unmanaged<ViewType>>;
124 
131 template <typename ViewType>
132 using Scratch = Kokkos::View<typename ViewType::data_type,
133  typename ViewType::array_layout,
134  typename ViewType::execution_space::scratch_memory_space,
135  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged>>;
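// A small usage sketch of the aliases above (hypothetical view v): given
//   Kokkos::View<double *> v("v", n);
// Unmanaged<decltype(v)> is the same view with non-owning (unmanaged) memory
// traits, Const<decltype(v)> exposes const double * data, and
// ConstUnmanaged<decltype(v)> combines both, which avoids reference counting
// when views are captured in kernels.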
136 
140 template <typename T>
141 struct BlockTridiagScalarType { typedef T type; };
142 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
143 template <>
144 struct BlockTridiagScalarType<double> { typedef float type; };
145 // template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
146 #endif
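// For example, with the small-scalar macro defined,
// BlockTridiagScalarType<double>::type is float, so the block tridiagonal
// factors are stored and solved in single precision; otherwise the type is T itself.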
147 
148 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
149 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
150  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
151 
152 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
153  { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStop()); }
154 #else
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
157 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
158 #endif
159 
163 template <typename MatrixType>
164 Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
165 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
166  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
167  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
168  using tpetra_map_type = typename impl_type::tpetra_map_type;
169  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
170  using tpetra_import_type = typename impl_type::tpetra_import_type;
171  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
172  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
173 
174  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
175  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
176 
177  bool hasBlockCrsMatrix = !A_bcrs.is_null();
178 
179  // It is OK here to use the graph of the A_crs matrix and a block size of 1
180  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
181 
182  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
183  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
184  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap(), blocksize)));
185  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
186  return Teuchos::rcp(new tpetra_import_type(src, tgt));
187 }
188 
189 // Partial replacement for forward-mode MultiVector::doImport.
190 // Permits overlapped communication and computation, but also supports synchronized exchange.
191 // I'm finding that overlapped comm/comp can give quite poor performance on some
192 // platforms, so we can't always use it straightforwardly.
193 
194 template <typename MatrixType>
195 struct AsyncableImport {
196  public:
197  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
198 
199  private:
203 #if !defined(HAVE_IFPACK2_MPI)
204  typedef int MPI_Request;
205  typedef int MPI_Comm;
206 #endif
207  using scalar_type = typename impl_type::scalar_type;
210 
211  static int isend(const MPI_Comm comm, const char *buf, int count, int dest, int tag, MPI_Request *ireq) {
212 #ifdef HAVE_IFPACK2_MPI
213  MPI_Request ureq;
214  int ret = MPI_Isend(const_cast<char *>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
215  if (ireq == NULL) MPI_Request_free(&ureq);
216  return ret;
217 #else
218  return 0;
219 #endif
220  }
221 
222  static int irecv(const MPI_Comm comm, char *buf, int count, int src, int tag, MPI_Request *ireq) {
223 #ifdef HAVE_IFPACK2_MPI
224  MPI_Request ureq;
225  int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
226  if (ireq == NULL) MPI_Request_free(&ureq);
227  return ret;
228 #else
229  return 0;
230 #endif
231  }
232 
233  static int waitany(int count, MPI_Request *reqs, int *index) {
234 #ifdef HAVE_IFPACK2_MPI
235  return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
236 #else
237  return 0;
238 #endif
239  }
240 
241  static int waitall(int count, MPI_Request *reqs) {
242 #ifdef HAVE_IFPACK2_MPI
243  return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
244 #else
245  return 0;
246 #endif
247  }
248 
249  public:
250  using tpetra_map_type = typename impl_type::tpetra_map_type;
251  using tpetra_import_type = typename impl_type::tpetra_import_type;
252 
253  using local_ordinal_type = typename impl_type::local_ordinal_type;
254  using global_ordinal_type = typename impl_type::global_ordinal_type;
255  using size_type = typename impl_type::size_type;
256  using impl_scalar_type = typename impl_type::impl_scalar_type;
257 
258  using int_1d_view_host = Kokkos::View<int *, Kokkos::HostSpace>;
259  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type *, Kokkos::HostSpace>;
260 
261  using execution_space = typename impl_type::execution_space;
262  using memory_space = typename impl_type::memory_space;
263  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
264  using size_type_1d_view = typename impl_type::size_type_1d_view;
265  using size_type_1d_view_host = Kokkos::View<size_type *, Kokkos::HostSpace>;
266 
267 #if defined(KOKKOS_ENABLE_CUDA)
268  using impl_scalar_type_1d_view =
269  typename std::conditional<std::is_same<execution_space, Kokkos::Cuda>::value,
270 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
271  Kokkos::View<impl_scalar_type *, Kokkos::CudaHostPinnedSpace>,
272 #elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
273  Kokkos::View<impl_scalar_type *, Kokkos::CudaSpace>,
274 #else // no experimental macros are defined
275  typename impl_type::impl_scalar_type_1d_view,
276 #endif
277  typename impl_type::impl_scalar_type_1d_view>::type;
278 #else
279  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
280 #endif
281  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type *, Kokkos::HostSpace>;
282  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
283  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
284 
285 #ifdef HAVE_IFPACK2_MPI
286  MPI_Comm comm;
287 #endif
288 
289  impl_scalar_type_2d_view_tpetra remote_multivector;
290  local_ordinal_type blocksize;
291 
292  template <typename T>
293  struct SendRecvPair {
294  T send, recv;
295  };
296 
297  // (s)end and (r)eceive data:
298  SendRecvPair<int_1d_view_host> pids; // mpi ranks
299  SendRecvPair<std::vector<MPI_Request>> reqs;  // MPI_Request is a pointer type; cannot use a Kokkos view
300  SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
301  SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
302  SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
303  SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
304  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
305 
306  local_ordinal_type_1d_view dm2cm; // permutation
307 
308 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
309  using exec_instance_1d_std_vector = std::vector<execution_space>;
310  exec_instance_1d_std_vector exec_instances;
311 #endif
312 
313  // for cuda
314  public:
315  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
316  const size_type_1d_view &offs) {
317  // wrap lens to kokkos view and deep copy to device
318  Kokkos::View<size_t *, Kokkos::HostSpace> lens_host(const_cast<size_t *>(lens.getRawPtr()), lens.size());
319  const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
320 
321  // exclusive scan
322  const Kokkos::RangePolicy<execution_space> policy(0, offs.extent(0));
323  const local_ordinal_type lens_size = lens_device.extent(0);
324  Kokkos::parallel_scan(
325  "AsyncableImport::RangePolicy::setOffsetValues",
326  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
327  if (final)
328  offs(i) = update;
329  update += (i < lens_size ? lens_device[i] : 0);
330  });
331  }
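// A worked example of the exclusive scan above (hypothetical lengths):
// lens = {3, 1, 2} with offs.extent(0) == 4 yields offs = {0, 3, 4, 6};
// offs(i) is where message i starts in the packed id/data arrays and the
// last entry is the total length.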
332 
333  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
334  const size_type_1d_view_host &offs) {
339  // exclusive scan
340  offs(0) = 0;
341  for (local_ordinal_type i = 1, iend = offs.extent(0); i < iend; ++i) {
342  offs(i) = offs(i - 1) + lens[i - 1];
343  }
344  }
345 
346  private:
347  void createMpiRequests(const tpetra_import_type &import) {
348  Tpetra::Distributor &distributor = import.getDistributor();
349 
350  // copy pids from distributor
351  const auto pids_from = distributor.getProcsFrom();
352  pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
353  memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int) * pids.recv.extent(0));
354 
355  const auto pids_to = distributor.getProcsTo();
356  pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
357  memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int) * pids.send.extent(0));
358 
359  // mpi requests
360  reqs.recv.resize(pids.recv.extent(0));
361  memset(reqs.recv.data(), 0, reqs.recv.size() * sizeof(MPI_Request));
362  reqs.send.resize(pids.send.extent(0));
363  memset(reqs.send.data(), 0, reqs.send.size() * sizeof(MPI_Request));
364 
365  // construct offsets
366 #if 0
367  const auto lengths_to = distributor.getLengthsTo();
368  offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
369 
370  const auto lengths_from = distributor.getLengthsFrom();
371  offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
372 
373  setOffsetValues(lengths_to, offset.send);
374  offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
375 
376  setOffsetValues(lengths_from, offset.recv);
377  offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
378 #else
379  const auto lengths_to = distributor.getLengthsTo();
380  offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
381 
382  const auto lengths_from = distributor.getLengthsFrom();
383  offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
384 
385  setOffsetValuesHost(lengths_to, offset_host.send);
386  // offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
387 
388  setOffsetValuesHost(lengths_from, offset_host.recv);
389  // offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
390 #endif
391  }
392 
393  void createSendRecvIDs(const tpetra_import_type &import) {
394  // For each remote PID, the list of LIDs to receive.
395  const auto remote_lids = import.getRemoteLIDs();
396  const local_ordinal_type_1d_view_host
397  remote_lids_view_host(const_cast<local_ordinal_type *>(remote_lids.getRawPtr()), remote_lids.size());
398  lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
399  Kokkos::deep_copy(lids.recv, remote_lids_view_host);
400 
401  // For each export PID, the list of LIDs to send.
402  auto epids = import.getExportPIDs();
403  auto elids = import.getExportLIDs();
404  TEUCHOS_ASSERT(epids.size() == elids.size());
405  lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
406  auto lids_send_host = Kokkos::create_mirror_view(lids.send);
407 
408  // naive search (not sure if pids or epids are sorted)
409  for (local_ordinal_type cnt = 0, i = 0, iend = pids.send.extent(0); i < iend; ++i) {
410  const auto pid_send_value = pids.send[i];
411  for (local_ordinal_type j = 0, jend = epids.size(); j < jend; ++j)
412  if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
413  TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i + 1]);
414  }
415  Kokkos::deep_copy(lids.send, lids_send_host);
416  }
417 
418  void createExecutionSpaceInstances() {
419 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
420  // The following line creates 8 streams:
421 #if KOKKOS_VERSION >= 40699
422  exec_instances =
423  Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
424 #else
425  exec_instances =
426  Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
427 #endif
428 #endif
429  }
430 
431  public:
432  // for cuda, all tag types are public
433  struct ToBuffer {};
434  struct ToMultiVector {};
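// ToBuffer selects packing (multivector -> contiguous send buffer);
// ToMultiVector selects unpacking (receive buffer -> remote multivector);
// the copy() overloads below branch on this PackTag.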
435 
436  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
437  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
438  const local_ordinal_type blocksize_,
439  const local_ordinal_type_1d_view dm2cm_) {
440  blocksize = blocksize_;
441  dm2cm = dm2cm_;
442 
443 #ifdef HAVE_IFPACK2_MPI
444  comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
445 #endif
446  const tpetra_import_type import(src_map, tgt_map);
447 
448  createMpiRequests(import);
449  createSendRecvIDs(import);
450  createExecutionSpaceInstances();
451  }
452 
453  void createDataBuffer(const local_ordinal_type &num_vectors) {
454  const size_type extent_0 = lids.recv.extent(0) * blocksize;
455  const size_type extent_1 = num_vectors;
456  if (remote_multivector.extent(0) == extent_0 &&
457  remote_multivector.extent(1) == extent_1) {
458  // skip
459  } else {
460  remote_multivector =
461  impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
462 
463  const auto send_buffer_size = offset_host.send[offset_host.send.extent(0) - 1] * blocksize * num_vectors;
464  const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0) - 1] * blocksize * num_vectors;
465 
466  buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
467  buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
468 
469  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
470  buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
471  buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
472  }
473  }
474  }
475 
476  void cancel() {
477 #ifdef HAVE_IFPACK2_MPI
478  waitall(reqs.recv.size(), reqs.recv.data());
479  waitall(reqs.send.size(), reqs.send.data());
480 #endif
481  }
482 
483  // ======================================================================
484  // Async version using execution space instances
485  // ======================================================================
486 
487 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
488  template <typename PackTag>
489  static void copy(const local_ordinal_type_1d_view &lids_,
490  const impl_scalar_type_1d_view &buffer_,
491  const local_ordinal_type ibeg_,
492  const local_ordinal_type iend_,
493  const impl_scalar_type_2d_view_tpetra &multivector_,
494  const local_ordinal_type blocksize_,
495  const execution_space &exec_instance_) {
496  const local_ordinal_type num_vectors = multivector_.extent(1);
497  const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
498  const local_ordinal_type idiff = iend_ - ibeg_;
499  const auto abase = buffer_.data() + mv_blocksize * ibeg_;
500 
501  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
502  local_ordinal_type vector_size(0);
503  if (blocksize_ <= 4)
504  vector_size = 4;
505  else if (blocksize_ <= 8)
506  vector_size = 8;
507  else if (blocksize_ <= 16)
508  vector_size = 16;
509  else
510  vector_size = 32;
511 
512  const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
513  const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
514  Kokkos::parallel_for( //"AsyncableImport::TeamPolicy::copyViaCudaStream",
515  Kokkos::Experimental::require(policy, work_item_property),
516  KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
517  const local_ordinal_type i = member.league_rank();
518  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
519  auto aptr = abase + blocksize_ * (i + idiff * j);
520  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
521  if (std::is_same<PackTag, ToBuffer>::value)
522  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
523  aptr[k] = bptr[k];
524  });
525  else
526  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
527  bptr[k] = aptr[k];
528  });
529  });
530  });
531  }
532 
533  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
534  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
535 
536 #ifdef HAVE_IFPACK2_MPI
537  // constants and reallocate data buffers if necessary
538  const local_ordinal_type num_vectors = mv.extent(1);
539  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
540 
541  // 0. post receive async
542  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
543  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
544  irecv(comm,
545  reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
546  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
547  pids.recv[i],
548  42,
549  &reqs.recv[i]);
550  } else {
551  irecv(comm,
552  reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
553  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
554  pids.recv[i],
555  42,
556  &reqs.recv[i]);
557  }
558  }
559 
561  execution_space().fence();
562 
563  // 1. async memcpy
564  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
565  // 1.0. enqueue pack buffer
566  if (i < 8) exec_instances[i % 8].fence();
567  copy<ToBuffer>(lids.send, buffer.send,
568  offset_host.send(i), offset_host.send(i + 1),
569  mv, blocksize,
570  // execution_space());
571  exec_instances[i % 8]);
572  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
573  // if (i<8) exec_instances[i%8].fence();
574  const local_ordinal_type num_vectors = mv.extent(1);
575  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
576 
577  Kokkos::deep_copy(exec_instances[i % 8],
578  Kokkos::subview(buffer_host.send,
579  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
580  offset_host.send(i) * mv_blocksize,
581  offset_host.send(i + 1) * mv_blocksize)),
582  Kokkos::subview(buffer.send,
583  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
584  offset_host.send(i) * mv_blocksize,
585  offset_host.send(i + 1) * mv_blocksize)));
586  }
587  }
589  // execution_space().fence();
590  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.send.extent(0)); ++i) {
591  // 1.1. sync the stream and isend
592  if (i < 8) exec_instances[i % 8].fence();
593  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
594  isend(comm,
595  reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
596  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
597  pids.send[i],
598  42,
599  &reqs.send[i]);
600  } else {
601  isend(comm,
602  reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
603  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
604  pids.send[i],
605  42,
606  &reqs.send[i]);
607  }
608  }
609 
610  // 2. poke communication
611  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
612  int flag;
613  MPI_Status stat;
614  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
615  }
616 #endif // HAVE_IFPACK2_MPI
617  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
618  }
619 
620  void syncRecvVar1() {
621  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
622 #ifdef HAVE_IFPACK2_MPI
623  // 0. wait for receive async.
624  for (local_ordinal_type i = 0; i < static_cast<local_ordinal_type>(pids.recv.extent(0)); ++i) {
625  local_ordinal_type idx = i;
626 
627  // 0.0. wait any
628  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
629 
630  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
631  const local_ordinal_type num_vectors = remote_multivector.extent(1);
632  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
633 
634  Kokkos::deep_copy(
635  Kokkos::subview(buffer.recv,
636  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
637  offset_host.recv(idx) * mv_blocksize,
638  offset_host.recv(idx + 1) * mv_blocksize)),
639  Kokkos::subview(buffer_host.recv,
640  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
641  offset_host.recv(idx) * mv_blocksize,
642  offset_host.recv(idx + 1) * mv_blocksize)));
643  }
644 
645  // 0.1. unpack data after data is moved into a device
646  copy<ToMultiVector>(lids.recv, buffer.recv,
647  offset_host.recv(idx), offset_host.recv(idx + 1),
648  remote_multivector, blocksize,
649  exec_instances[idx % 8]);
650  }
651 
652  // 1. fire up all cuda events
653  Kokkos::fence();
654 
655  // 2. cleanup all open comm
656  waitall(reqs.send.size(), reqs.send.data());
657 #endif // HAVE_IFPACK2_MPI
658  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
659  }
660 #endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
661 
662  // ======================================================================
663  // Generic version without using execution space instances
664  // - the only difference between device and host architectures is the use
665  //   of team or range policies.
666  // ======================================================================
667  template <typename PackTag>
668  static void copy(const local_ordinal_type_1d_view &lids_,
669  const impl_scalar_type_1d_view &buffer_,
670  const local_ordinal_type &ibeg_,
671  const local_ordinal_type &iend_,
672  const impl_scalar_type_2d_view_tpetra &multivector_,
673  const local_ordinal_type blocksize_) {
674  const local_ordinal_type num_vectors = multivector_.extent(1);
675  const local_ordinal_type mv_blocksize = blocksize_ * num_vectors;
676  const local_ordinal_type idiff = iend_ - ibeg_;
677  const auto abase = buffer_.data() + mv_blocksize * ibeg_;
678  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
679  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
680  local_ordinal_type vector_size(0);
681  if (blocksize_ <= 4)
682  vector_size = 4;
683  else if (blocksize_ <= 8)
684  vector_size = 8;
685  else if (blocksize_ <= 16)
686  vector_size = 16;
687  else
688  vector_size = 32;
689  const team_policy_type policy(idiff, 1, vector_size);
690  Kokkos::parallel_for(
691  "AsyncableImport::TeamPolicy::copy",
692  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
693  const local_ordinal_type i = member.league_rank();
694  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, num_vectors), [&](const local_ordinal_type &j) {
695  auto aptr = abase + blocksize_ * (i + idiff * j);
696  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
697  if (std::is_same<PackTag, ToBuffer>::value)
698  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
699  aptr[k] = bptr[k];
700  });
701  else
702  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize_), [&](const local_ordinal_type &k) {
703  bptr[k] = aptr[k];
704  });
705  });
706  });
707  } else {
708  const Kokkos::RangePolicy<execution_space> policy(0, idiff * num_vectors);
709  Kokkos::parallel_for(
710  "AsyncableImport::RangePolicy::copy",
711  policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
712  const local_ordinal_type i = ij % idiff;
713  const local_ordinal_type j = ij / idiff;
714  auto aptr = abase + blocksize_ * (i + idiff * j);
715  auto bptr = &multivector_(blocksize_ * lids_(i + ibeg_), j);
716  auto from = std::is_same<PackTag, ToBuffer>::value ? bptr : aptr;
717  auto to = std::is_same<PackTag, ToBuffer>::value ? aptr : bptr;
718  memcpy(to, from, sizeof(impl_scalar_type) * blocksize_);
719  });
720  }
721  }
722 
726  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
727  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
728 
729 #ifdef HAVE_IFPACK2_MPI
730  // constants and reallocate data buffers if necessary
731  const local_ordinal_type num_vectors = mv.extent(1);
732  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
733 
734  // receive async
735  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
736  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
737  irecv(comm,
738  reinterpret_cast<char *>(buffer.recv.data() + offset_host.recv[i] * mv_blocksize),
739  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
740  pids.recv[i],
741  42,
742  &reqs.recv[i]);
743  } else {
744  irecv(comm,
745  reinterpret_cast<char *>(buffer_host.recv.data() + offset_host.recv[i] * mv_blocksize),
746  (offset_host.recv[i + 1] - offset_host.recv[i]) * mv_blocksize * sizeof(impl_scalar_type),
747  pids.recv[i],
748  42,
749  &reqs.recv[i]);
750  }
751  }
752 
753  // send async
754  for (local_ordinal_type i = 0, iend = pids.send.extent(0); i < iend; ++i) {
755  copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i + 1),
756  mv, blocksize);
757  Kokkos::fence();
758  if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
759  isend(comm,
760  reinterpret_cast<const char *>(buffer.send.data() + offset_host.send[i] * mv_blocksize),
761  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
762  pids.send[i],
763  42,
764  &reqs.send[i]);
765  } else {
766  Kokkos::deep_copy(
767  Kokkos::subview(buffer_host.send,
768  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
769  offset_host.send(i) * mv_blocksize,
770  offset_host.send(i + 1) * mv_blocksize)),
771  Kokkos::subview(buffer.send,
772  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
773  offset_host.send(i) * mv_blocksize,
774  offset_host.send(i + 1) * mv_blocksize)));
775  isend(comm,
776  reinterpret_cast<const char *>(buffer_host.send.data() + offset_host.send[i] * mv_blocksize),
777  (offset_host.send[i + 1] - offset_host.send[i]) * mv_blocksize * sizeof(impl_scalar_type),
778  pids.send[i],
779  42,
780  &reqs.send[i]);
781  }
782  }
783 
784  // I find that issuing an Iprobe seems to nudge some MPIs into action,
785  // which helps with overlapped comm/comp performance.
786  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
787  int flag;
788  MPI_Status stat;
789  MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
790  }
791 #endif
792  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
793  }
794 
795  void syncRecvVar0() {
796  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
797 #ifdef HAVE_IFPACK2_MPI
798  // receive async.
799  for (local_ordinal_type i = 0, iend = pids.recv.extent(0); i < iend; ++i) {
800  local_ordinal_type idx = i;
801  waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
802  if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
803  const local_ordinal_type num_vectors = remote_multivector.extent(1);
804  const local_ordinal_type mv_blocksize = blocksize * num_vectors;
805  Kokkos::deep_copy(
806  Kokkos::subview(buffer.recv,
807  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
808  offset_host.recv(idx) * mv_blocksize,
809  offset_host.recv(idx + 1) * mv_blocksize)),
810  Kokkos::subview(buffer_host.recv,
811  Kokkos::pair<local_ordinal_type, local_ordinal_type>(
812  offset_host.recv(idx) * mv_blocksize,
813  offset_host.recv(idx + 1) * mv_blocksize)));
814  }
815  copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx + 1),
816  remote_multivector, blocksize);
817  }
818  // wait on the sends to match all Isends with a cleanup operation.
819  waitall(reqs.send.size(), reqs.send.data());
820 #endif
821  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
822  }
823 
827  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
828 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
829 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
830  asyncSendRecvVar1(mv);
831 #else
832  asyncSendRecvVar0(mv);
833 #endif
834 #else
835  asyncSendRecvVar0(mv);
836 #endif
837  }
838  void syncRecv() {
839 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
840 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
841  syncRecvVar1();
842 #else
843  syncRecvVar0();
844 #endif
845 #else
846  syncRecvVar0();
847 #endif
848  }
849 
850  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
851  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
852  asyncSendRecv(mv);
853  syncRecv();
854  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
855  }
856 
857  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
858 };
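// A minimal usage sketch of AsyncableImport (hedged; num_vectors and mv_view
// are hypothetical names, and createBlockCrsAsyncImporter below is the factory):
//
//   auto importer = createBlockCrsAsyncImporter<MatrixType>(A);
//   if (!importer.is_null()) {
//     importer->createDataBuffer(num_vectors); // (re)allocate send/recv buffers
//     importer->asyncSendRecv(mv_view);        // post irecv, pack, isend
//     /* ... overlapped local computation ... */
//     importer->syncRecv();                    // waitany, unpack, waitall sends
//   }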
859 
860 template <typename ViewType1, typename ViewType2>
861 struct are_same_struct {
862  ViewType1 keys1;
863  ViewType2 keys2;
864 
865  are_same_struct(ViewType1 keys1_, ViewType2 keys2_)
866  : keys1(keys1_)
867  , keys2(keys2_) {}
868  KOKKOS_INLINE_FUNCTION
869  void operator()(int i, unsigned int &count) const {
870  if (keys1(i) != keys2(i)) count++;
871  }
872 };
873 
874 template <typename ViewType1, typename ViewType2>
875 bool are_same(ViewType1 keys1, ViewType2 keys2) {
876  unsigned int are_same_ = 0;
877 
878  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
879  are_same_struct(keys1, keys2),
880  are_same_);
881  return are_same_ == 0;
882 }
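// Example: are_same(domain_map_global_iD, column_map_global_iD) below is true
// iff the two index lists agree entrywise, i.e. the owned part of the column
// map is ordered exactly like the domain map.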
883 
887 template <typename MatrixType>
888 Teuchos::RCP<AsyncableImport<MatrixType>>
889 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
890  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
891  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
892  using tpetra_map_type = typename impl_type::tpetra_map_type;
893  using local_ordinal_type = typename impl_type::local_ordinal_type;
894  using global_ordinal_type = typename impl_type::global_ordinal_type;
895  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
896  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
897  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
898  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type *, typename tpetra_map_type::device_type>;
899 
900  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
901  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
902 
903  bool hasBlockCrsMatrix = !A_bcrs.is_null();
904 
905  // It is OK here to use the graph of the A_crs matrix and a block size of 1
906  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
907 
908  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
909  const auto domain_map = g.getDomainMap();
910  const auto column_map = g.getColMap();
911 
912  std::vector<global_ordinal_type> gids;
913 
914  Kokkos::Subview<global_indices_array_device_type, std::pair<int, int>> column_map_global_iD_last;
915 
916  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
917  {
918  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
919 
920  global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
921  global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
922 
923  if (are_same(domain_map_global_iD, column_map_global_iD)) {
924  // this should be the most likely path
925  separate_remotes = true;
926  need_owned_permutation = false;
927 
928  column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
929  std::pair<int, int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
930  } else {
931  // This loop is relatively expensive
932  for (size_t i = 0; i < column_map->getLocalNumElements(); ++i) {
933  const global_ordinal_type gid = column_map->getGlobalElement(i);
934  if (!domain_map->isNodeGlobalElement(gid)) {
935  found_first = true;
936  gids.push_back(gid);
937  } else if (found_first) {
938  separate_remotes = false;
939  break;
940  }
941  if (!found_first && !need_owned_permutation &&
942  domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
943  // The owned part of the domain and column maps are different
944  // orderings. We *could* do a super efficient impl of this case in the
945  // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
946  // really, if a caller cares about speed, they wouldn't make different
947  // local permutations like this. So we punt on the best impl and go for
948  // a pretty good one: the permutation is done in place in
949  // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
950  // is the presumably worse memory access pattern of the input vector.
951  need_owned_permutation = true;
952  }
953  }
954  }
955  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
956  }
957 
958  if (separate_remotes) {
959  IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
960  const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
961  const auto parsimonious_col_map =
         need_owned_permutation
             ? Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()))
             : Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
962  if (parsimonious_col_map->getGlobalNumElements() > 0) {
963  // make the importer only if needed.
964  local_ordinal_type_1d_view dm2cm;
965  if (need_owned_permutation) {
966  dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
967  const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
968  for (size_t i = 0; i < domain_map->getLocalNumElements(); ++i)
969  dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
970  Kokkos::deep_copy(dm2cm, dm2cm_host);
971  }
972  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
973  return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
974  }
975  }
976  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
977  return Teuchos::null;
978 }
979 
980 template <typename local_ordinal_type>
981 local_ordinal_type costTRSM(const local_ordinal_type block_size) {
982  return block_size * block_size;
983 }
984 
985 template <typename local_ordinal_type>
986 local_ordinal_type costGEMV(const local_ordinal_type block_size) {
987  return 2 * block_size * block_size;
988 }
989 
990 template <typename local_ordinal_type>
991 local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
992  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length - 1) * costGEMV(block_size);
993 }
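// Worked example (hypothetical sizes): for block_size = 3, costTRSM = 9 and
// costGEMV = 18, so a subline of length 4 costs
// costTriDiagSolve(4, 3) = 2*4*9 + 2*3*18 = 180 (flops up to a constant factor).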
994 
995 template <typename local_ordinal_type>
996 local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
997  const local_ordinal_type num_teams,
998  const local_ordinal_type line_length,
999  const local_ordinal_type block_size,
1000  const local_ordinal_type n_subparts_per_part) {
1001  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part - 1) * 2) / n_subparts_per_part);
1002  if (subline_length < 1) {
1003  return INT_MAX;
1004  }
1005 
1006  const local_ordinal_type p_n_lines = ceil(double(num_parts) / num_teams);
1007  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part) * num_parts / num_teams);
1008  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part - 1) * num_parts / num_teams);
1009 
1010  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1011  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part - 1) * 2, block_size);
1012  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length, block_size);
1013  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1014 
1015  if (n_subparts_per_part == 1) {
1016  return p_costApplyAinv;
1017  }
1018  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1019 }
1020 
1021 template <typename local_ordinal_type>
1022 local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
1023  const local_ordinal_type num_teams,
1024  const local_ordinal_type line_length,
1025  const local_ordinal_type block_size) {
1026  local_ordinal_type n_subparts_per_part_0 = 1;
1027  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
1028  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0 + 1);
1029  while (flop_0 > flop_1) {
1030  flop_0 = flop_1;
1031  flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0) + 1);
1032  }
1033  return n_subparts_per_part_0;
1034 }
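// A sketch of the search above: starting from n_subparts_per_part = 1, keep
// incrementing while the modeled cost strictly decreases; the returned value
// is the first local minimum of costSolveSchur in n_subparts_per_part.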
1035 
1036 template <typename ArgActiveExecutionMemorySpace>
1037 struct SolveTridiagsDefaultModeAndAlgo;
1038 
1042 template <typename MatrixType>
1043 BlockHelperDetails::PartInterface<MatrixType>
1044 createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1045  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1046  const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type>> &partitions,
1047  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1048  IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1049  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1050  using local_ordinal_type = typename impl_type::local_ordinal_type;
1051  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1052  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1053  using size_type = typename impl_type::size_type;
1054 
1055  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1056 
1057  TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1058  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1059  constexpr int vector_length = impl_type::vector_length;
1060  constexpr int internal_vector_length = impl_type::internal_vector_length;
1061 
1062  const auto comm = A->getRowMap()->getComm();
1063 
1064  BlockHelperDetails::PartInterface<MatrixType> interf;
1065 
1066  const bool jacobi = partitions.size() == 0;
1067  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1068  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1069 
1070  typedef std::pair<local_ordinal_type, local_ordinal_type> size_idx_pair_type;
1071  std::vector<size_idx_pair_type> partsz(nparts);
1072 
1073  if (!jacobi) {
1074  for (local_ordinal_type i = 0; i < nparts; ++i)
1075  partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1076  std::sort(partsz.begin(), partsz.end(),
1077  [](const size_idx_pair_type &x, const size_idx_pair_type &y) {
1078  return x.first > y.first;
1079  });
1080  }
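// E.g. (hypothetical sizes), partitions of sizes {2, 5, 3} give
// partsz = {(5,1), (3,2), (2,0)}; visiting parts largest-first keeps the
// parts that share a SIMD pack (nearly) equal in length.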
1081 
1082  local_ordinal_type n_subparts_per_part;
1083  if (n_subparts_per_part_in == -1) {
1084  // If the number of subparts is set to -1, the user lets the algorithm
1085  // decide the value automatically
1086  using execution_space = typename impl_type::execution_space;
1087 
1088  const int line_length = partsz[0].first;
1089 
1090  const local_ordinal_type team_size =
1091  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1092  recommended_team_size(blocksize, vector_length, internal_vector_length);
1093 
1094  const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1095 
1096  n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1097 
1098 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1099  printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1100 #endif
1101  } else {
1102  n_subparts_per_part = n_subparts_per_part_in;
1103  }
1104 
1105  // Total number of sub lines:
1106  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1107  // Total number of sub lines + the Schur complement blocks.
1108  // For a given line, 2 sub lines imply one Schur complement, 3 sub lines imply two Schur complements, etc.
1109  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part - 1);
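// E.g., nparts = 3 lines with n_subparts_per_part = 2 give n_sub_parts = 6
// sub lines plus 3 Schur complement blocks, so n_sub_parts_and_schur = 9.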
1110 
1111 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1112  local_ordinal_type nrows = 0;
1113  if (jacobi)
1114  nrows = nparts;
1115  else
1116  for (local_ordinal_type i = 0; i < nparts; ++i) nrows += partitions[i].size();
1117 
1118  TEUCHOS_TEST_FOR_EXCEPT_MSG(nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1119  << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1120 #endif
1121 
1122  // permutation vector
1123  std::vector<local_ordinal_type> p;
1124  if (jacobi) {
1125  interf.max_partsz = 1;
1126  interf.max_subpartsz = 0;
1127  interf.n_subparts_per_part = 1;
1128  interf.nparts = nparts;
1129  } else {
1130  // reorder parts to maximize simd packing efficiency
1131  p.resize(nparts);
1132 
1133  for (local_ordinal_type i = 0; i < nparts; ++i)
1134  p[i] = partsz[i].second;
1135 
1136  interf.max_partsz = partsz[0].first;
1137 
1138  constexpr local_ordinal_type connection_length = 2;
1139  const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1140  const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1141 
1142  interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1143  interf.n_subparts_per_part = n_subparts_per_part;
1144  interf.nparts = nparts;
1145  }
1146 
1147  // allocate parts
1148  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1149  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1150  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1151  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1152  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1153 
1154  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1155  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1156  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1157 
1158  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1159 
1160  // mirror to host and compute on host execution space
1161  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1162  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1163 
1164  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1165  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1166  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1167  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1168 
1169  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1170  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1171  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1172 
1173  // Determine parts.
1174  interf.row_contiguous = true;
1175  partptr(0) = 0;
1176  part2rowidx0(0) = 0;
1177  part2packrowidx0(0) = 0;
1178  local_ordinal_type pack_nrows = 0;
1179  local_ordinal_type pack_nrows_sub = 0;
1180  if (jacobi) {
1181  IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1182  // Jacobi (all lines have length 1) means that A_n_lclrows == nparts,
1183  // so the mapping between parts and rows is trivial.
1184  // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i
1185  for (local_ordinal_type i = 0; i <= nparts; ++i) {
1186  part2rowidx0(i) = i;
1187  partptr(i) = i;
1188  }
1189  for (local_ordinal_type i = 0; i < nparts; ++i) {
1190  rowidx2part(i) = i;
1191  lclrow(i) = i;
1192  }
1193  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1194  // assume no overlap.
1195  if (ip % vector_length == 0) pack_nrows = 1;
1196  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
1197  }
1198  part2rowidx0_sub(0) = 0;
1199  partptr_sub(0, 0) = 0;
1200 
1201  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1202  constexpr local_ordinal_type ipnrows = 1;
1203  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1204 
1205  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1206  "Inconsistent line length in part " << ip);
1207 
1208  constexpr local_ordinal_type connection_length = 2;
1209 
1210  if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length)
1211  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
1212  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1213 
1214  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1215  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1216 
1217  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1218 
1219  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1220  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1221  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1222  if (local_sub_ip != n_subparts_per_part - 1) {
1223  if (local_sub_ip != 0) {
1224  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1225  } else if (ip != 0) {
1226  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1227  }
1228  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1229  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1230  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1231 
1232  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1233  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1234 
1235 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1236  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1237  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1238 #endif
1239  } else {
1240  if (local_sub_ip != 0) {
1241  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1242  } else if (ip != 0) {
1243  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1244  }
1245  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1246 
1247  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1248 
1249 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1250  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1251 #endif
1252  }
1253  }
1254  }
1255 
1256 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1257  std::cout << "partptr_sub = " << std::endl;
1258  for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1259  for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1260  std::cout << partptr_sub(i, j) << " ";
1261  }
1262  std::cout << std::endl;
1263  }
1264  std::cout << "partptr_sub end" << std::endl;
1265 #endif
1266 
1267  {
1268  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1269 
1270  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1271  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1272  part2packrowidx0_sub(ip, 0) = 0;
1273  }
1274  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1275  if (ipack != 0) {
1276  local_ordinal_type ip_min = ipack * vector_length;
1277  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1278  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1279  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1280  }
1281  }
1282 
1283  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1284  local_ordinal_type ip_min = ipack * vector_length;
1285  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1286 
1287  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1288 
1289  constexpr local_ordinal_type connection_length = 2;
1290 
1291  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1292  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1293 
1294  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1295  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1296  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1297 
1298  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1299 
1300  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1301  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1302  }
1303  }
1304  }
1305 
1306  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1307  }
1308  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1309  } else {
1310  IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1311  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1312  const auto *part = &partitions[p[ip]];
1313  const local_ordinal_type ipnrows = part->size();
1314  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip - 1]].size())));
1315  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1316  BlockHelperDetails::get_msg_prefix(comm)
1317  << "partition " << p[ip]
1318  << " is empty, which is not allowed.");
1319  // assume no overlap.
1320  part2rowidx0(ip + 1) = part2rowidx0(ip) + ipnrows;
1321  // Since parts are ordered in decreasing size, the size of the first
1322  // part in a pack is the size for all parts in the pack.
1323  if (ip % vector_length == 0) pack_nrows = ipnrows;
1324  part2packrowidx0(ip + 1) = part2packrowidx0(ip) + ((ip + 1) % vector_length == 0 || ip + 1 == nparts ? pack_nrows : 0);
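// Worked example (hypothetical sizes): with vector_length = 4 and part sizes
// {10, 9, 9, 8}, pack_nrows = 10, so part2packrowidx0 stays 0 for ip = 0..3
// and jumps to 10 at ip = 4, the pack boundary.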
1325  const local_ordinal_type offset = partptr(ip);
1326  for (local_ordinal_type i = 0; i < ipnrows; ++i) {
1327  const auto lcl_row = (*part)[i];
1328  TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1329  BlockHelperDetails::get_msg_prefix(comm)
1330  << "partitions[" << p[ip] << "]["
1331  << i << "] = " << lcl_row
1332  << " but input matrix implies limits of [0, " << A_n_lclrows - 1
1333  << "].");
1334  lclrow(offset + i) = lcl_row;
1335  rowidx2part(offset + i) = ip;
1336  if (interf.row_contiguous && offset + i > 0 && lclrow((offset + i) - 1) + 1 != lcl_row)
1337  interf.row_contiguous = false;
1338  }
1339  partptr(ip + 1) = offset + ipnrows;
1340 
1341 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1342  printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1343  printf("partptr(%d+1) = %d\n", ip, partptr(ip + 1));
1344 #endif
1345  }
1346 
1347  part2rowidx0_sub(0) = 0;
1348  partptr_sub(0, 0) = 0;
1349  // const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1350 
1351  for (local_ordinal_type ip = 0; ip < nparts; ++ip) {
1352  const auto *part = &partitions[p[ip]];
1353  const local_ordinal_type ipnrows = part->size();
1354  const local_ordinal_type full_line_length = partptr(ip + 1) - partptr(ip);
1355 
1356  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length != ipnrows, std::logic_error,
1357  "In part " << ip << ": partptr implies " << full_line_length << " rows, but the partition has " << ipnrows << " rows.");
1358 
1359  constexpr local_ordinal_type connection_length = 2;
1360 
1361  TEUCHOS_TEST_FOR_EXCEPTION(full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length,
1362  std::logic_error,
1363  "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1364 
1365  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1366  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
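// Worked example (hypothetical sizes): full_line_length = 20 with
// n_subparts_per_part = 3 and connection_length = 2 gives
// sub_line_length = (20 - 2*2) / 3 = 5 and last_sub_line_length = 20 - 2*(2 + 5) = 6,
// i.e. the line splits into rows 5 | 2 | 5 | 2 | 6.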
1367 
1368  if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1369 
1370  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part; ++local_sub_ip) {
1371  const local_ordinal_type sub_ip = nparts * (2 * local_sub_ip) + ip;
1372  const local_ordinal_type schur_ip = nparts * (2 * local_sub_ip + 1) + ip;
1373  if (local_sub_ip != n_subparts_per_part - 1) {
1374  if (local_sub_ip != 0) {
1375  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1376  } else if (ip != 0) {
1377  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1378  }
1379  partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1380  partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1381  partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1382 
1383  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1384  part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1385 
1386 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1387  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1388  printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", schur_ip, partptr_sub(schur_ip, 0), connection_length);
1389 #endif
1390  } else {
1391  if (local_sub_ip != 0) {
1392  partptr_sub(sub_ip, 0) = partptr_sub(nparts * (2 * local_sub_ip - 1) + ip, 1);
1393  } else if (ip != 0) {
1394  partptr_sub(sub_ip, 0) = partptr_sub(nparts * 2 * (n_subparts_per_part - 1) + ip - 1, 1);
1395  }
1396  partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1397 
1398  part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1399 
1400 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1401  printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1402 #endif
1403  }
1404  }
1405  }
1406 
1407  {
1408  local_ordinal_type npacks = ceil(float(nparts) / vector_length);
1409 
1410  local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1411  for (local_ordinal_type ip = 0; ip < ip_max; ++ip) {
1412  part2packrowidx0_sub(ip, 0) = 0;
1413  }
1414  for (local_ordinal_type ipack = 0; ipack < npacks; ++ipack) {
1415  if (ipack != 0) {
1416  local_ordinal_type ip_min = ipack * vector_length;
1417  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1418  for (local_ordinal_type ip = ip_min; ip < ip_max; ++ip) {
1419  part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip - vector_length, part2packrowidx0_sub.extent(1) - 1);
1420  }
1421  }
1422 
1423  for (size_type local_sub_ip = 0; local_sub_ip < part2packrowidx0_sub.extent(1) - 1; ++local_sub_ip) {
1424  local_ordinal_type ip_min = ipack * vector_length;
1425  ip_max = nparts > (ipack + 1) * vector_length ? (ipack + 1) * vector_length : nparts;
1426 
1427  const local_ordinal_type full_line_length = partptr(ip_min + 1) - partptr(ip_min);
1428 
1429  constexpr local_ordinal_type connection_length = 2;
1430 
1431  const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1432  const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1433 
1434  if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1435  if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1436  if (local_sub_ip == part2packrowidx0_sub.extent(1) - 2) pack_nrows_sub = last_sub_line_length;
1437 
1438  part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1439 
1440  for (local_ordinal_type ip = ip_min + 1; ip < ip_max; ++ip) {
1441  part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1442  }
1443  }
1444  }
1445 
1446  Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1447  }
1448  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1449  }
1450 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1451  TEUCHOS_ASSERT(partptr(nparts) == nrows);
1452 #endif
1453  if (lclrow(0) != 0) interf.row_contiguous = false;
1454 
1455  Kokkos::deep_copy(interf.partptr, partptr);
1456  Kokkos::deep_copy(interf.lclrow, lclrow);
1457 
1458  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1459 
1460  // assume no overlap. Thus:
1461  interf.part2rowidx0 = interf.partptr;
1462  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1463 
1464  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1465  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1466 
1467  { // Fill packptr.
1468  IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1469  local_ordinal_type npacks = 0;
1471  for (local_ordinal_type ip = 1; ip <= nparts; ++ip) // n_sub_parts_and_schur
1472  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1473  ++npacks;
1474 
1475  interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1476  const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1477  packptr(0) = 0;
1478  for (local_ordinal_type ip = 1, k = 1; ip <= nparts; ++ip)
1479  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1480  packptr(k++) = ip;
1481 
1482  Kokkos::deep_copy(interf.packptr, packptr);
1483 
1484  const local_ordinal_type npacks_per_subpart = ceil(float(nparts) / vector_length);
1485  npacks = npacks_per_subpart * (part2packrowidx0_sub.extent(1) - 1);
1486 
1487  interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart * n_subparts_per_part);
1488  interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart, n_subparts_per_part - 1);
1489 
1490  const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1491  const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1492 
1493  // Fill packindices_sub and packindices_schur
1494  for (local_ordinal_type local_sub_ip = 0; local_sub_ip < n_subparts_per_part - 1; ++local_sub_ip) {
1495  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1496  packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1497  packindices_schur(local_pack_ip, local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1498  }
1499  }
1500 
1501  for (local_ordinal_type local_pack_ip = 0; local_pack_ip < npacks_per_subpart; ++local_pack_ip) {
1502  packindices_sub((n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part - 1) * npacks_per_subpart + local_pack_ip;
1503  }
1504 
1505 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1506  std::cout << "packindices_sub = " << std::endl;
1507  for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1508  std::cout << packindices_sub(i) << " ";
1509  }
1510  std::cout << std::endl;
1511  std::cout << "packindices_sub end" << std::endl;
1512 
1513  std::cout << "packindices_schur = " << std::endl;
1514  for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1515  for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1516  std::cout << packindices_schur(i, j) << " ";
1517  }
1518  std::cout << std::endl;
1519  }
1520 
1521  std::cout << "packindices_schur end" << std::endl;
1522 #endif
1523 
1524  Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1525  Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1526 
1527  interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr_sub"), npacks + 1);
1528  const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1529  packptr_sub(0) = 0;
1530  for (local_ordinal_type k = 0; k < npacks + 1; ++k)
1531  packptr_sub(k) = packptr(k % npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
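// Worked example (hypothetical sizes): npacks_per_subpart = 2 with
// packptr = {0, 4, 8} replicates the pack boundaries per sub-part:
// packptr_sub = {0, 4, 8, 12, 16, ...}, shifting by
// packptr(npacks_per_subpart) = 8 for each successive group of sub-part packs.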
1532 
1533  Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1534  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1535  }
1536  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1537 
1538  return interf;
1539 }
1540 
1544 template <typename MatrixType>
1545 struct BlockTridiags {
1546  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1547  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1548  using size_type_1d_view = typename impl_type::size_type_1d_view;
1549  using size_type_2d_view = typename impl_type::size_type_2d_view;
1550  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1551  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1552  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1553 
1554  // flat_td_ptr(i) is the index into flat-array values of the start of the
1555  // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1556  // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1557  // vector_length is the position in the pack.
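// For example, with vector_length = 4, tridiags 0..3 share one pack:
// pack_td_ptr gives the pack's base offset into the packed values, and
// i % 4 selects the SIMD lane within each packed block.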
1558  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1559  // List of local column indices into A from which to grab
1560  // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1561  local_ordinal_type_1d_view A_colindsub;
1562  // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1563  // tridiag's pack, and i % vector_length gives the position in the pack.
1564  vector_type_3d_view values;
1565  // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1566  // Schur's pack, and i % vector_length gives the position in the pack.
1567  vector_type_3d_view values_schur;
1568  // inv(A_00)*A_01 block values.
1569  vector_type_4d_view e_values;
1570 
1571  // The following are for fused block Jacobi only.
1572  // For block row i, diag_offsets(i)...diag_offsets(i) + bs^2
1573  // is the range of scalars for the diagonal block.
1574  size_type_1d_view diag_offsets;
1575  // For fused residual+solve block Jacobi case,
1576  // this contains the diagonal block inverses in flat, local row indexing:
1577  // d_inv(row, :, :) gives the row-major block for row.
1578  btdm_scalar_type_3d_view d_inv;
1579 
1580  bool is_diagonal_only;
1581 
1582  BlockTridiags() = default;
1583  BlockTridiags(const BlockTridiags &b) = default;
1584 
1585  // Given a block index into a row-major tridiag, return the block's row.
1586  template <typename idx_type>
1587  static KOKKOS_FORCEINLINE_FUNCTION
1588  idx_type
1589  IndexToRow(const idx_type &ind) { return (ind + 1) / 3; }
1590  // Given a row of a row-major tridiag, return the index of the first block
1591  // in that row.
1592  template <typename idx_type>
1593  static KOKKOS_FORCEINLINE_FUNCTION
1594  idx_type
1595  RowToIndex(const idx_type &row) { return row > 0 ? 3 * row - 1 : 0; }
1596  // Number of blocks in a tridiag having a given number of rows.
1597  template <typename idx_type>
1598  static KOKKOS_FORCEINLINE_FUNCTION
1599  idx_type
1600  NumBlocks(const idx_type &nrows) { return nrows > 0 ? 3 * nrows - 2 : 0; }
1601  // Number of blocks associated to a Schur complement having a given number of rows.
1602  template <typename idx_type>
1603  static KOKKOS_FORCEINLINE_FUNCTION
1604  idx_type
1605  NumBlocksSchur(const idx_type &nrows) { return nrows > 0 ? 3 * nrows + 2 : 0; }
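// Layout sketch for nrows = 3 (row-major storage, NumBlocks(3) = 7):
//   index: 0 1 | 2 3 4 | 5 6
//   block: A00 A01 | A10 A11 A12 | A21 A22
// so RowToIndex yields 0, 2, 5 and IndexToRow maps 0-1 -> 0, 2-4 -> 1, 5-6 -> 2.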
1606 };
1607 
1611 template <typename MatrixType>
1612 BlockTridiags<MatrixType>
1613 createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1614  IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1615  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1616  using execution_space = typename impl_type::execution_space;
1617  using local_ordinal_type = typename impl_type::local_ordinal_type;
1618  using size_type = typename impl_type::size_type;
1619  using size_type_2d_view = typename impl_type::size_type_2d_view;
1620 
1621  constexpr int vector_length = impl_type::vector_length;
1622 
1624 
1625  const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1626 
1627  { // construct the flat index pointers into the tridiag values array.
1628  btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1629  const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part);
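// Exclusive prefix sum over per-(part, sub-part) block counts: even local
// sub-part indices are tridiag lines (NumBlocks), odd ones are the Schur
// connections between them (NumBlocksSchur).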
1630  Kokkos::parallel_scan(
1631  "createBlockTridiags::RangePolicy::flat_td_ptr",
1632  policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1633  const local_ordinal_type partidx = i / (2 * interf.n_subparts_per_part);
1634  const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1635 
1636  if (final) {
1637  btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1638  }
1639  if (local_subpartidx != (2 * interf.n_subparts_per_part - 1)) {
1640  const local_ordinal_type nrows = interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 1) - interf.partptr_sub(interf.nparts * local_subpartidx + partidx, 0);
1641  if (local_subpartidx % 2 == 0)
1642  update += btdm.NumBlocks(nrows);
1643  else
1644  update += btdm.NumBlocksSchur(nrows);
1645  }
1646  });
1647 
1648  const auto nblocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts - 1, 2 * interf.n_subparts_per_part - 1));
1649  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1650  }
1651 
1652  // And the packed index pointers.
1653  if (vector_length == 1) {
1654  btdm.pack_td_ptr = btdm.flat_td_ptr;
1655  } else {
1656  // const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1657 
1658  local_ordinal_type npacks_per_subpart = 0;
1659  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1660  Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1661  for (local_ordinal_type ip = 1; ip <= interf.nparts; ++ip) // n_sub_parts_and_schur
1662  if (part2packrowidx0(ip) != part2packrowidx0(ip - 1))
1663  ++npacks_per_subpart;
1664 
1665  btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2 * interf.n_subparts_per_part);
1666  const Kokkos::RangePolicy<execution_space> policy(0, npacks_per_subpart);
1667 
1668  Kokkos::parallel_for(
1669  "createBlockTridiags::RangePolicy::pack_td_ptr",
1670  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1671  for (local_ordinal_type j = 0; j < 2 * interf.n_subparts_per_part; ++j) {
1672  const local_ordinal_type pack_id = (j == 2 * interf.n_subparts_per_part - 1) ? i + (j - 1) * npacks_per_subpart : i + j * npacks_per_subpart;
1673  const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id + 1) - interf.packptr_sub(pack_id);
1674 
1675  const local_ordinal_type parti = interf.packptr_sub(pack_id);
1676  const local_ordinal_type partidx = parti % interf.nparts;
1677 
1678  for (local_ordinal_type pti = 0; pti < nparts_in_pack; ++pti) {
1679  btdm.pack_td_ptr(partidx + pti, j) = btdm.flat_td_ptr(i, j);
1680  }
1681  }
1682  });
1683  }
1684 
1685  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1686 
1687  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1688  constexpr local_ordinal_type connection_length = 2;
1689 
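// Each Schur system spans connection_length = 2 rows (NumBlocks(2) = 4 blocks),
// plus one coupling block on each side that is interior to the part; the +1
// corrections below drop the couplings missing at the first and last sub-part.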
1690  host_pack_td_ptr_schur(0, 0) = 0;
1691  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1692  if (i % vector_length == 0) {
1693  if (i != 0)
1694  host_pack_td_ptr_schur(i, 0) = host_pack_td_ptr_schur(i - 1, host_pack_td_ptr_schur.extent(1) - 1);
1695  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part - 1; ++j) {
1696  host_pack_td_ptr_schur(i, j + 1) = host_pack_td_ptr_schur(i, j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part - 2 ? 1 : 0);
1697  }
1698  } else {
1699  for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1700  host_pack_td_ptr_schur(i, j) = host_pack_td_ptr_schur(i - 1, j);
1701  }
1702  }
1703  }
1704 
1705  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1706 
1707 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1708  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1709  std::cout << "flat_td_ptr = " << std::endl;
1710  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1711  for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1712  std::cout << host_flat_td_ptr(i, j) << " ";
1713  }
1714  std::cout << std::endl;
1715  }
1716  std::cout << "flat_td_ptr end" << std::endl;
1717 
1718  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1719 
1720  std::cout << "pack_td_ptr = " << std::endl;
1721  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1722  for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1723  std::cout << host_pack_td_ptr(i, j) << " ";
1724  }
1725  std::cout << std::endl;
1726  }
1727  std::cout << "pack_td_ptr end" << std::endl;
1728 
1729  std::cout << "pack_td_ptr_schur = " << std::endl;
1730  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1731  for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1732  std::cout << host_pack_td_ptr_schur(i, j) << " ";
1733  }
1734  std::cout << std::endl;
1735  }
1736  std::cout << "pack_td_ptr_schur end" << std::endl;
1737 #endif
1738 
1739  // values and A_colindsub are created in the symbolic phase
1740  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1741 
1742  return btdm;
1743 }
1744 
1745 // Set the tridiags to be I to the full pack block size. That way, if a
1746 // tridiag within a pack is shorter than the longest one, the extra blocks are
1747 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1748 // in the packed multivector are 0, and the tridiag LU reflects the extra I
1749 // blocks, then the solve proceeds as though the extra blocks aren't
1750 // present. Since this extra work is part of the SIMD calls, it's not actually
1751 // extra work. Instead, it means we don't have to put checks or masks in, or
1752 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1753 // since the numeric phase fills in only the used entries, leaving these I
1754 // blocks intact.
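// For example (hypothetical sizes), a pack of four tridiags with row counts
// {10, 9, 9, 8}: the three shorter tridiags get identity blocks in their
// unused trailing rows, so the pack-wide LU and solves stay well defined.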
1755 template <typename MatrixType>
1756 void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
1757  const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
1758  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1759  using execution_space = typename impl_type::execution_space;
1760  using local_ordinal_type = typename impl_type::local_ordinal_type;
1761  using size_type_2d_view = typename impl_type::size_type_2d_view;
1762 
1763  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1764  const local_ordinal_type blocksize = btdm.values.extent(1);
1765 
1766  {
1767  const int vector_length = impl_type::vector_length;
1768  const int internal_vector_length = impl_type::internal_vector_length;
1769 
1770  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1771  using internal_vector_type = typename impl_type::internal_vector_type;
1772  using internal_vector_type_4d_view =
1773  typename impl_type::internal_vector_type_4d_view;
1774 
1775  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1776  const internal_vector_type_4d_view values(reinterpret_cast<internal_vector_type *>(btdm.values.data()),
1777  btdm.values.extent(0),
1778  btdm.values.extent(1),
1779  btdm.values.extent(2),
1780  vector_length / internal_vector_length);
1781  const local_ordinal_type vector_loop_size = values.extent(3);
1782 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1783  local_ordinal_type total_team_size(0);
1784  if (blocksize <= 5)
1785  total_team_size = 32;
1786  else if (blocksize <= 9)
1787  total_team_size = 64;
1788  else if (blocksize <= 12)
1789  total_team_size = 96;
1790  else if (blocksize <= 16)
1791  total_team_size = 128;
1792  else if (blocksize <= 20)
1793  total_team_size = 160;
1794  else
1795  total_team_size = 160;
1796  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1797  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1798 #elif defined(KOKKOS_ENABLE_HIP)
1799  // FIXME: HIP
1800  // These settings might be completely wrong;
1801  // we will have to run some experiments to decide
1802  // what makes sense on AMD GPUs.
1803  local_ordinal_type total_team_size(0);
1804  if (blocksize <= 5)
1805  total_team_size = 32;
1806  else if (blocksize <= 9)
1807  total_team_size = 64;
1808  else if (blocksize <= 12)
1809  total_team_size = 96;
1810  else if (blocksize <= 16)
1811  total_team_size = 128;
1812  else if (blocksize <= 20)
1813  total_team_size = 160;
1814  else
1815  total_team_size = 160;
1816  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1817  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1818 #elif defined(KOKKOS_ENABLE_SYCL)
1819  // SYCL: FIXME
1820  local_ordinal_type total_team_size(0);
1821  if (blocksize <= 5)
1822  total_team_size = 32;
1823  else if (blocksize <= 9)
1824  total_team_size = 64;
1825  else if (blocksize <= 12)
1826  total_team_size = 96;
1827  else if (blocksize <= 16)
1828  total_team_size = 128;
1829  else if (blocksize <= 20)
1830  total_team_size = 160;
1831  else
1832  total_team_size = 160;
1833  const local_ordinal_type team_size = total_team_size / vector_loop_size;
1834  const team_policy_type policy(packptr.extent(0) - 1, team_size, vector_loop_size);
1835 #else
1836  // Host architecture: team size is always one
1837  const team_policy_type policy(packptr.extent(0) - 1, 1, 1);
1838 #endif
1839  Kokkos::parallel_for(
1840  "setTridiagsToIdentity::TeamPolicy",
1841  policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1842  const local_ordinal_type k = member.league_rank();
1843  const local_ordinal_type ibeg = pack_td_ptr(packptr(k), 0);
1844  const local_ordinal_type iend = pack_td_ptr(packptr(k), pack_td_ptr.extent(1) - 1);
1845 
1846  const local_ordinal_type diff = iend - ibeg;
1847  const local_ordinal_type icount = diff / 3 + (diff % 3 > 0);
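// Diagonal blocks sit at ibeg, ibeg + 3, ibeg + 6, ... in the row-major
// tridiag layout, so icount is diff / 3 rounded up.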
1848  const btdm_scalar_type one(1);
1849  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
1850  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, icount), [&](const local_ordinal_type &ii) {
1851  const local_ordinal_type i = ibeg + ii * 3;
1852  for (local_ordinal_type j = 0; j < blocksize; ++j) {
1853  values(i, j, j, v) = one;
1854  }
1855  });
1856  });
1857  });
1858  }
1859 }
1860 
1864 template <typename MatrixType>
1865 void performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1866  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1867  const BlockHelperDetails::PartInterface<MatrixType> &interf,
1868  BlockTridiags<MatrixType> &btdm,
1869  BlockHelperDetails::AmD<MatrixType> &amd,
1870  const bool overlap_communication_and_computation,
1871  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
1872  bool useSeqMethod,
1873  bool use_fused_jacobi) {
1874  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1875 
1876  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
1877 
1878  using execution_space = typename impl_type::execution_space;
1879  using host_execution_space = typename impl_type::host_execution_space;
1880 
1881  using local_ordinal_type = typename impl_type::local_ordinal_type;
1882  using global_ordinal_type = typename impl_type::global_ordinal_type;
1883  using size_type = typename impl_type::size_type;
1884  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1885  using size_type_1d_view = typename impl_type::size_type_1d_view;
1886  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1887  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1888  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1889  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1890  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
1891 
1892  constexpr int vector_length = impl_type::vector_length;
1893 
1894  const auto comm = A->getRowMap()->getComm();
1895 
1896  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1897  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1898 
1899  bool hasBlockCrsMatrix = !A_bcrs.is_null();
1900  TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1901  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows() / g->getLocalNumRows();
1902 
1903  // mirroring to host
1904  const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
1905  const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
1906  const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
1907  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1908  const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);
1909 
1910  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1911 
1912  Kokkos::View<local_ordinal_type *, host_execution_space> col2row("col2row", A->getLocalNumCols());
1913 
1914  // find column to row map on host
1915 
1916  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1917  {
1918  const auto rowmap = g->getRowMap();
1919  const auto colmap = g->getColMap();
1920  const auto dommap = g->getDomainMap();
1921  TEUCHOS_ASSERT(!(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1922 
1923 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1924  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1925  Kokkos::parallel_for(
1926  "performSymbolicPhase::RangePolicy::col2row",
1927  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1928  const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1930  if (dommap->isNodeGlobalElement(gid)) {
1931  const local_ordinal_type lc = colmap->getLocalElement(gid);
1932 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1933  TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1934  BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1935  << " gives an invalid local column.");
1936 #endif
1937  col2row(lc) = lr;
1938  }
1939  });
1940 #endif
1941  }
1942 
1943  // construct the D and R graphs in A = D + R.
1944  {
1945  const auto local_graph = g->getLocalGraphHost();
1946  const auto local_graph_rowptr = local_graph.row_map;
1947  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1948  const auto local_graph_colidx = local_graph.entries;
1949 
1950  // assume no overlap.
1951 
1952  Kokkos::View<local_ordinal_type *, host_execution_space> lclrow2idx("lclrow2idx", nrows);
1953  {
1954  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1955  Kokkos::parallel_for(
1956  "performSymbolicPhase::RangePolicy::lclrow2idx",
1957  policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1958  lclrow2idx[lclrow(i)] = i;
1959  });
1960  }
1961 
1962  // count (block) nnzs in D and R.
1963  typedef BlockHelperDetails::SumReducer<size_type, 3, host_execution_space> sum_reducer_type;
1964  typename sum_reducer_type::value_type sum_reducer_value;
1965  {
1966  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
1967  Kokkos::parallel_reduce
1968  // profiling interface does not work
1969  ( //"performSymbolicPhase::RangePolicy::count_nnz",
1970  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1971  // LID -> index.
1972  const local_ordinal_type ri0 = lclrow2idx[lr];
1973  const local_ordinal_type pi0 = rowidx2part(ri0);
1974  for (size_type j = local_graph_rowptr(lr); j < local_graph_rowptr(lr + 1); ++j) {
1975  const local_ordinal_type lc = local_graph_colidx(j);
1976  const local_ordinal_type lc2r = col2row[lc];
1977  bool incr_R = false;
1978  do { // breakable
1979  if (lc2r == (local_ordinal_type)-1) {
1980  incr_R = true;
1981  break;
1982  }
1983  const local_ordinal_type ri = lclrow2idx[lc2r];
1984  const local_ordinal_type pi = rowidx2part(ri);
1985  if (pi != pi0) {
1986  incr_R = true;
1987  break;
1988  }
1989  // Test for being in the tridiag. This is done in index space. In
1990  // LID space, tridiag LIDs in a row are not necessarily related by
1991  // {-1, 0, 1}.
1992  if (ri0 + 1 >= ri && ri0 <= ri + 1)
1993  ++update.v[0]; // D_nnz
1994  else
1995  incr_R = true;
1996  } while (0);
1997  if (incr_R) {
1998  if (lc < nrows)
1999  ++update.v[1]; // R_nnz_owned
2000  else
2001  ++update.v[2]; // R_nnz_remote
2002  }
2003  }
2004  },
2005  sum_reducer_type(sum_reducer_value));
2006  }
2007  size_type D_nnz = sum_reducer_value.v[0];
2008  size_type R_nnz_owned = sum_reducer_value.v[1];
2009  size_type R_nnz_remote = sum_reducer_value.v[2];
2010 
2011  if (!overlap_communication_and_computation) {
2012  R_nnz_owned += R_nnz_remote;
2013  R_nnz_remote = 0;
2014  }
2015 
2016  // construct the D_00 graph.
2017  {
2018  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2019 
2020  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2021  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2022 
2023 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2024  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2025 #endif
2026 
2027  const local_ordinal_type nparts = partptr.extent(0) - 1;
2028 
2029  {
2030  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2031  Kokkos::parallel_for(
2032  "performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2033  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2034  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2035  local_ordinal_type offset = 0;
2036  for (local_ordinal_type ri0 = partptr(pi0); ri0 < partptr(pi0 + 1); ++ri0) {
2037  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2038  offset = 1;
2039  const local_ordinal_type lr0 = lclrow(ri0);
2040  const size_type j0 = local_graph_rowptr(lr0);
2041  for (size_type j = j0; j < local_graph_rowptr(lr0 + 1); ++j) {
2042  const local_ordinal_type lc = local_graph_colidx(j);
2043  const local_ordinal_type lc2r = col2row[lc];
2044  if (lc2r == (local_ordinal_type)-1) continue;
2045  const local_ordinal_type ri = lclrow2idx[lc2r];
2046  const local_ordinal_type pi = rowidx2part(ri);
2047  if (pi != pi0) continue;
2048  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2049  const local_ordinal_type row_entry = j - j0;
2050  D_A_colindsub(flat_td_ptr(pi0, 0) + ((td_row_os + ri) - ri0)) = row_entry;
2051  }
2052  }
2053  });
2054  }
2055 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2056  for (size_t i = 0; i < D_A_colindsub.extent(0); ++i)
2057  TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2058 #endif
2059  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2060 
2061  // Allocate values.
2062  {
2063  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0) - 1, btdm.pack_td_ptr.extent(1) - 1);
2064  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2065  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2066 
2067  if (interf.n_subparts_per_part > 1) {
2068  const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0) - 1, btdm.pack_td_ptr_schur.extent(1) - 1);
2069  const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2070  btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2071  }
2072 
2073  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2074  }
2075  }
2076 
2077  // Construct the R graph.
2078  {
2079  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2080  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2081 
2082  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2083  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2084 
2085  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2086  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2087 
2088  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2089  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2090 
2091  {
2092  const Kokkos::RangePolicy<host_execution_space> policy(0, nrows);
2093  Kokkos::parallel_for(
2094  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2095  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2096  const local_ordinal_type ri0 = lclrow2idx[lr];
2097  const local_ordinal_type pi0 = rowidx2part(ri0);
2098  const size_type j0 = local_graph_rowptr(lr);
2099  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2100  const local_ordinal_type lc = local_graph_colidx(j);
2101  const local_ordinal_type lc2r = col2row[lc];
2102  if (lc2r != (local_ordinal_type)-1) {
2103  const local_ordinal_type ri = lclrow2idx[lc2r];
2104  const local_ordinal_type pi = rowidx2part(ri);
2105  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2106  continue;
2107  }
2108  }
2109  // exclusive scan will be performed later
2110  if (!overlap_communication_and_computation || lc < nrows) {
2111  ++R_rowptr(lr);
2112  } else {
2113  ++R_rowptr_remote(lr);
2114  }
2115  }
2116  });
2117  }
2118 
2119  // exclusive scan
2120  typedef BlockHelperDetails::ArrayValueType<size_type, 2> update_type;
2121  {
2122  Kokkos::RangePolicy<host_execution_space> policy(0, nrows + 1);
2123  Kokkos::parallel_scan(
2124  "performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2125  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, update_type &update, const bool &final) {
2126  update_type val;
2127  val.v[0] = R_rowptr(lr);
2128  if (overlap_communication_and_computation)
2129  val.v[1] = R_rowptr_remote(lr);
2130 
2131  if (final) {
2132  R_rowptr(lr) = update.v[0];
2133  if (overlap_communication_and_computation)
2134  R_rowptr_remote(lr) = update.v[1];
2135 
2136  if (lr < nrows) {
2137  const local_ordinal_type ri0 = lclrow2idx[lr];
2138  const local_ordinal_type pi0 = rowidx2part(ri0);
2139 
2140  size_type cnt_rowptr = R_rowptr(lr);
2141  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2142 
2143  const size_type j0 = local_graph_rowptr(lr);
2144  for (size_type j = j0; j < local_graph_rowptr(lr + 1); ++j) {
2145  const local_ordinal_type lc = local_graph_colidx(j);
2146  const local_ordinal_type lc2r = col2row[lc];
2147  if (lc2r != (local_ordinal_type)-1) {
2148  const local_ordinal_type ri = lclrow2idx[lc2r];
2149  const local_ordinal_type pi = rowidx2part(ri);
2150  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2151  continue;
2152  }
2153  const local_ordinal_type row_entry = j - j0;
2154  if (!overlap_communication_and_computation || lc < nrows)
2155  R_A_colindsub(cnt_rowptr++) = row_entry;
2156  else
2157  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2158  }
2159  }
2160  }
2161  update += val;
2162  });
2163  }
2164  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2165  Kokkos::deep_copy(amd.rowptr, R_rowptr);
2166  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2167  if (overlap_communication_and_computation) {
2168  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2169  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2170  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2171  }
2172 
2173  // Allocate or view values.
2174  if (hasBlockCrsMatrix)
2175  amd.tpetra_values = (const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst());
2176  else {
2177  amd.tpetra_values = (const_cast<crs_matrix_type *>(A_crs.get()))->getLocalValuesDevice(Tpetra::Access::ReadWrite);
2178  }
2179  }
2180 
2181  // Allocate view for E and initialize the values with B:
2182 
2183  if (interf.n_subparts_per_part > 1)
2184  btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2185  }
2186  // Precompute offsets of each A and x entry to speed up residual.
2187  // Applies if all of these are true:
2188  // - hasBlockCrsMatrix
2189  // - execution_space is a GPU
2190  // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2191  //
2192  // Reading A, x take up to 4 and 6 levels of indirection respectively,
2193  // but precomputing the offsets reduces it to 2 for both (get index, then value)
2194  if (BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
2195  bool is_async_importer_active = !async_importer.is_null();
2196  local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2197  bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2198  BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2199  }
2200 
2201  // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets.
2202  if (use_fused_jacobi) {
2203  btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
2204  auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
2205  auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
2206  btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
2207  }
2208  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2209 }
2210 
2214 template <typename ArgActiveExecutionMemorySpace>
2215 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
2216 
2217 template <>
2218 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2219  typedef KB::Mode::Serial mode_type;
2220 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2221  typedef KB::Algo::Level3::CompactMKL algo_type;
2222 #else
2223  typedef KB::Algo::Level3::Blocked algo_type;
2224 #endif
2225  static int recommended_team_size(const int /* blksize */,
2226  const int /* vector_length */,
2227  const int /* internal_vector_length */) {
2228  return 1;
2229  }
2230 };
2231 
2232 #if defined(KOKKOS_ENABLE_CUDA)
2233 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2234  const int vector_length,
2235  const int internal_vector_length) {
2236  const int vector_size = vector_length / internal_vector_length;
2237  int total_team_size(0);
2238  if (blksize <= 5)
2239  total_team_size = 32;
2240  else if (blksize <= 9)
2241  total_team_size = 32; // 64
2242  else if (blksize <= 12)
2243  total_team_size = 96;
2244  else if (blksize <= 16)
2245  total_team_size = 128;
2246  else if (blksize <= 20)
2247  total_team_size = 160;
2248  else
2249  total_team_size = 160;
2250  return 2 * total_team_size / vector_size;
2251 }
2252 template <>
2253 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2254  typedef KB::Mode::Team mode_type;
2255  typedef KB::Algo::Level3::Unblocked algo_type;
2256  static int recommended_team_size(const int blksize,
2257  const int vector_length,
2258  const int internal_vector_length) {
2259  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2260  }
2261 };
2262 template <>
2263 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2264  typedef KB::Mode::Team mode_type;
2265  typedef KB::Algo::Level3::Unblocked algo_type;
2266  static int recommended_team_size(const int blksize,
2267  const int vector_length,
2268  const int internal_vector_length) {
2269  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2270  }
2271 };
2272 #endif
2273 
2274 #if defined(KOKKOS_ENABLE_HIP)
2275 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2276  const int vector_length,
2277  const int internal_vector_length) {
2278  const int vector_size = vector_length / internal_vector_length;
2279  int total_team_size(0);
2280  if (blksize <= 5)
2281  total_team_size = 32;
2282  else if (blksize <= 9)
2283  total_team_size = 32; // 64
2284  else if (blksize <= 12)
2285  total_team_size = 96;
2286  else if (blksize <= 16)
2287  total_team_size = 128;
2288  else if (blksize <= 20)
2289  total_team_size = 160;
2290  else
2291  total_team_size = 160;
2292  return 2 * total_team_size / vector_size;
2293 }
2294 template <>
2295 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2296  typedef KB::Mode::Team mode_type;
2297  typedef KB::Algo::Level3::Unblocked algo_type;
2298  static int recommended_team_size(const int blksize,
2299  const int vector_length,
2300  const int internal_vector_length) {
2301  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2302  }
2303 };
2304 template <>
2305 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2306  typedef KB::Mode::Team mode_type;
2307  typedef KB::Algo::Level3::Unblocked algo_type;
2308  static int recommended_team_size(const int blksize,
2309  const int vector_length,
2310  const int internal_vector_length) {
2311  return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2312  }
2313 };
2314 #endif
2315 
2316 #if defined(KOKKOS_ENABLE_SYCL)
2317 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2318  const int vector_length,
2319  const int internal_vector_length) {
2320  const int vector_size = vector_length / internal_vector_length;
2321  int total_team_size(0);
2322  if (blksize <= 5)
2323  total_team_size = 32;
2324  else if (blksize <= 9)
2325  total_team_size = 32; // 64
2326  else if (blksize <= 12)
2327  total_team_size = 96;
2328  else if (blksize <= 16)
2329  total_team_size = 128;
2330  else if (blksize <= 20)
2331  total_team_size = 160;
2332  else
2333  total_team_size = 160;
2334  return 2 * total_team_size / vector_size;
2335 }
2336 template <>
2337 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2338  typedef KB::Mode::Team mode_type;
2339  typedef KB::Algo::Level3::Unblocked algo_type;
2340  static int recommended_team_size(const int blksize,
2341  const int vector_length,
2342  const int internal_vector_length) {
2343  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2344  }
2345 };
2346 template <>
2347 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2348  typedef KB::Mode::Team mode_type;
2349  typedef KB::Algo::Level3::Unblocked algo_type;
2350  static int recommended_team_size(const int blksize,
2351  const int vector_length,
2352  const int internal_vector_length) {
2353  return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2354  }
2355 };
2356 #endif
2357 
2358 template <typename impl_type, typename WWViewType>
2359 KOKKOS_INLINE_FUNCTION void
2360 solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2361  const typename impl_type::local_ordinal_type & /* blocksize */,
2362  const typename impl_type::local_ordinal_type &i0,
2363  const typename impl_type::local_ordinal_type &r0,
2364  const typename impl_type::local_ordinal_type &nrows,
2365  const typename impl_type::local_ordinal_type &v,
2366  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2367  const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2368  const WWViewType &WW,
2369  const bool skip_first_pass = false) {
2370  using execution_space = typename impl_type::execution_space;
2371  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2372  using member_type = typename team_policy_type::member_type;
2373  using local_ordinal_type = typename impl_type::local_ordinal_type;
2374 
2375  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2376 
2377  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2378  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2379 
2380  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2381 
2382  // constant
2383  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2384  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2385 
2386  // subview pattern
2387  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2388  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2389  auto X2 = X1;
2390 
2391  local_ordinal_type i = i0, r = r0;
2392 
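// Block forward/backward substitution along the tridiag: the forward sweep
// computes X(r) <- L(r,r)^{-1} (X(r) - A(r,r-1) X(r-1)) with the unit-lower
// factors, and the backward sweep X(r) <- U(r,r)^{-1} (X(r) - A(r,r+1) X(r+1))
// with the upper factors.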
2393  if (nrows > 1) {
2394  // solve Lx = x
2395  if (skip_first_pass) {
2396  i += (nrows - 2) * 3;
2397  r += (nrows - 2);
2398  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2399  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2400  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2401  KB::Trsm<member_type,
2402  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2403  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2404  X1.assign_data(X2.data());
2405  i += 3;
2406  } else {
2407  KB::Trsm<member_type,
2408  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2409  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2410  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
2411  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
2412  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
2413  member.team_barrier();
2414  KB::Gemm<member_type,
2415  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2416  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2417  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
2418  KB::Trsm<member_type,
2419  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
2420  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2421  X1.assign_data(X2.data());
2422  }
2423  }
2424 
2425  // solve Ux = x
2426  KB::Trsm<member_type,
2427  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2428  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
2429  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2430  i -= 3;
2431  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
2432  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
2433  member.team_barrier();
2434  KB::Gemm<member_type,
2435  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2436  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
2437 
2438  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
2439  KB::Trsm<member_type,
2440  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
2441  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
2442  X1.assign_data(X2.data());
2443  }
2444  } else {
2445  // matrix is already inverted
2446  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2447  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
2448  member.team_barrier();
2449  KB::Gemm<member_type,
2450  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
2451  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
2452  }
2453 }
2454 
2455 template <typename impl_type, typename WWViewType, typename XViewType>
2456 KOKKOS_INLINE_FUNCTION void
2457 solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2458  const typename impl_type::local_ordinal_type &blocksize,
2459  const typename impl_type::local_ordinal_type &i0,
2460  const typename impl_type::local_ordinal_type &r0,
2461  const typename impl_type::local_ordinal_type &nrows,
2462  const typename impl_type::local_ordinal_type &v,
2463  const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2464  const XViewType &X_internal_vector_values, // Unmanaged<typename impl_type::internal_vector_type_4d_view>
2465  const WWViewType &WW) {
2466  using execution_space = typename impl_type::execution_space;
2467  // using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2468  // using member_type = typename team_policy_type::member_type;
2469  using local_ordinal_type = typename impl_type::local_ordinal_type;
2470 
2471  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
2472 
2473  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2474  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2475 
2476  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2477 
2478  // base pointers
2479  auto A = D_internal_vector_values.data();
2480  auto X = X_internal_vector_values.data();
2481 
2482  // constant
2483  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2484  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2485  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2486 
2487  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2488  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2489  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); // blocksize*vector_length;
2490  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); // vector_length;
2491  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2492  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); // vector_length;
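// The KOKKOSBATCHED_*_INTERNAL_INVOKE macros operate on raw pointers, so the
// 4-D views are addressed manually through their strides; the SIMD lane v is
// fixed while astep/xstep advance one block row at a time.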
2493 
2494  // move to starting point
2495  A += i0 * astep + v;
2496  X += r0 * xstep + v;
2497 
2498  // for (local_ordinal_type col=0;col<num_vectors;++col)
2499  if (nrows > 1) {
2500  // solve Lx = x
2501  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2502  member,
2503  KB::Diag::Unit,
2504  blocksize, blocksize,
2505  one,
2506  A, as0, as1,
2507  X, xs0);
2508 
2509  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
2510  member.team_barrier();
2511  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2512  member,
2513  blocksize, blocksize,
2514  -one,
2515  A + 2 * astep, as0, as1,
2516  X, xs0,
2517  one,
2518  X + 1 * xstep, xs0);
2519  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2520  member,
2521  KB::Diag::Unit,
2522  blocksize, blocksize,
2523  one,
2524  A + 3 * astep, as0, as1,
2525  X + 1 * xstep, xs0);
2526 
2527  A += 3 * astep;
2528  X += 1 * xstep;
2529  }
2530 
2531  // solve Ux = x
2532  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2533  member,
2534  KB::Diag::NonUnit,
2535  blocksize, blocksize,
2536  one,
2537  A, as0, as1,
2538  X, xs0);
2539 
2540  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
2541  A -= 3 * astep;
2542  member.team_barrier();
2543  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2544  member,
2545  blocksize, blocksize,
2546  -one,
2547  A + 1 * astep, as0, as1,
2548  X, xs0,
2549  one,
2550  X - 1 * xstep, xs0);
2551  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2552  member,
2553  KB::Diag::NonUnit,
2554  blocksize, blocksize,
2555  one,
2556  A, as0, as1,
2557  X - 1 * xstep, xs0);
2558  X -= 1 * xstep;
2559  }
2560  // for multiple rhs
2561  // X += xs1;
2562  } else {
2563  const local_ordinal_type ws0 = WW.stride_0();
2564  auto W = WW.data() + v;
2565  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
2566  member, blocksize, X, xs0, W, ws0);
2567  member.team_barrier();
2568  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
2569  member,
2570  blocksize, blocksize,
2571  one,
2572  A, as0, as1,
2573  W, xs0,
2574  zero,
2575  X, xs0);
2576  }
2577 }
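// Illustrative sketch (not part of Ifpack2; names are made up): the scalar
// analogue of the block forward/backward sweep above. Given LU factors of a
// tridiagonal matrix -- unit sub-diagonal multipliers l[1..n-1], U diagonal
// u[0..n-1], U super-diagonal c[0..n-2] -- it solves T x = b in place. The
// kernel above performs the same recurrence with TRSV/GEMV acting on
// blocksize x blocksize tiles, one SIMD lane per packed part.
#if 0
#include <vector>

inline void tridiag_lu_solve(const std::vector<double> &l,
                             const std::vector<double> &u,
                             const std::vector<double> &c,
                             std::vector<double> &x) {  // on entry x holds b
  const int n = static_cast<int>(x.size());
  // forward sweep: solve L y = b (cf. the lower TRSV/GEMV loop)
  for (int i = 1; i < n; ++i) x[i] -= l[i] * x[i - 1];
  // backward sweep: solve U x = y (cf. the upper TRSV/GEMV loop)
  x[n - 1] /= u[n - 1];
  for (int i = n - 2; i >= 0; --i) x[i] = (x[i] - c[i] * x[i + 1]) / u[i];
}
#endif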
2578 
2579 template <typename local_ordinal_type, typename ViewType>
2580 void writeBTDValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2581 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2582  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2583  std::ofstream myfile;
2584  myfile.open(fileName);
2585 
2586  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2587  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2588  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2589  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2590 
2591  const local_ordinal_type block_size = scalar_values.extent(1);
2592 
2593  const local_ordinal_type n_rows_per_part = (n_blocks_per_part + 2) / 3 * block_size;
2594  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2595 
2596  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2597 
2598  myfile << "%%MatrixMarket matrix coordinate real general" << std::endl;
2599  myfile << "%%nnz = " << nnz;
2600  myfile << " block size = " << block_size;
2601  myfile << " number of blocks = " << n_blocks;
2602  myfile << " number of parts = " << n_parts;
2603  myfile << " number of blocks per part = " << n_blocks_per_part;
2604  myfile << " number of rows = " << n_rows;
2605  myfile << " number of cols = " << n_rows;
2606  myfile << " number of packs = " << n_packs << std::endl;
2607 
2608  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2609 
2610  local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2611  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2612  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2613  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2614  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2615  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2616  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2617  continue;
2618  if (i_block_in_part % 3 == 0) {
2619  current_row_offset = i_block_in_part / 3 * block_size;
2620  current_col_offset = i_block_in_part / 3 * block_size;
2621  } else if (i_block_in_part % 3 == 1) {
2622  current_row_offset = (i_block_in_part - 1) / 3 * block_size;
2623  current_col_offset = ((i_block_in_part - 1) / 3 + 1) * block_size;
2624  } else if (i_block_in_part % 3 == 2) {
2625  current_row_offset = ((i_block_in_part - 2) / 3 + 1) * block_size;
2626  current_col_offset = (i_block_in_part - 2) / 3 * block_size;
2627  }
2628  current_row_offset += current_part_idx * n_rows_per_part;
2629  current_col_offset += current_part_idx * n_rows_per_part;
2630  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2631  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2632  current_row = current_row_offset + i_in_block + 1;
2633  current_col = current_col_offset + j_in_block + 1;
2634  myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2635  }
2636  }
2637  }
2638  }
2639  }
2640 
2641  myfile.close();
2642 #endif
2643 }
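// Illustrative sketch (not part of Ifpack2; helper name is made up): the
// block-index -> (block row, block col) mapping used by the writer above.
// Blocks of one tridiagonal part are stored as [D_0, U_0, L_1, D_1, U_1,
// L_2, ...], so every third block starts a new block row.
#if 0
#include <utility>

inline std::pair<int, int> btd_block_coords(int i_block_in_part) {
  const int q = i_block_in_part / 3;
  switch (i_block_in_part % 3) {
    case 0: return {q, q};       // diagonal block D_q
    case 1: return {q, q + 1};   // super-diagonal block U_q
    default: return {q + 1, q};  // sub-diagonal block L_{q+1}
  }
}
#endif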
2644 
2645 template <typename local_ordinal_type, typename ViewType>
2646 void write4DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2647 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2648  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2649  std::ofstream myfile;
2650  myfile.open(fileName);
2651 
2652  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2653  const local_ordinal_type n_blocks = scalar_values.extent(0) * n_parts_per_pack;
2654  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2655 
2656  const local_ordinal_type block_size = scalar_values.extent(1);
2657  const local_ordinal_type n_cols = scalar_values.extent(2);
2658 
2659  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2660  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2661 
2662  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2663 
2664  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2665  myfile << "%%block size = " << block_size;
2666  myfile << " number of blocks = " << n_blocks;
2667  myfile << " number of parts = " << n_parts;
2668  myfile << " number of blocks per part = " << n_blocks_per_part;
2669  myfile << " number of rows = " << n_rows;
2670  myfile << " number of cols = " << n_cols;
2671  myfile << " number of packs = " << n_packs << std::endl;
2672 
2673  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2674 
2675  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2676  (void)current_row_offset;
2677  (void)current_part_idx;
2678  for (local_ordinal_type j_in_block = 0; j_in_block < n_cols; ++j_in_block) {
2679  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2680  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2681  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2682  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2683  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2684 
2685  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(0))
2686  continue;
2687  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2688  myfile << scalar_values(current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2689  }
2690  }
2691  }
2692  }
2693  }
2694  myfile.close();
2695 #endif
2696 }
2697 
2698 template <typename local_ordinal_type, typename ViewType>
2699 void write5DMultiVectorValuesToFile(const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2700 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2701  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2702  std::ofstream myfile;
2703  myfile.open(fileName);
2704 
2705  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type)scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2706  const local_ordinal_type n_blocks = scalar_values.extent(1) * n_parts_per_pack;
2707  const local_ordinal_type n_blocks_per_part = n_blocks / n_parts;
2708 
2709  const local_ordinal_type block_size = scalar_values.extent(2);
2710  const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2711  const local_ordinal_type n_cols = n_blocks_cols * block_size;
2712 
2713  const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2714  const local_ordinal_type n_rows = n_rows_per_part * n_parts;
2715 
2716  const local_ordinal_type n_packs = ceil(float(n_parts) / n_parts_per_pack);
2717 
2718  myfile << "%%MatrixMarket matrix array real general" << std::endl;
2719  myfile << "%%block size = " << block_size;
2720  myfile << " number of blocks = " << n_blocks;
2721  myfile << " number of parts = " << n_parts;
2722  myfile << " number of blocks per part = " << n_blocks_per_part;
2723  myfile << " number of rows = " << n_rows;
2724  myfile << " number of cols = " << n_cols;
2725  myfile << " number of packs = " << n_packs << std::endl;
2726 
2727  myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2728 
2729  local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2730  (void)current_row_offset;
2731  (void)current_part_idx;
2732  for (local_ordinal_type i_block_col = 0; i_block_col < n_blocks_cols; ++i_block_col) {
2733  for (local_ordinal_type j_in_block = 0; j_in_block < block_size; ++j_in_block) {
2734  for (local_ordinal_type i_pack = 0; i_pack < n_packs; ++i_pack) {
2735  for (local_ordinal_type i_part_in_pack = 0; i_part_in_pack < n_parts_per_pack; ++i_part_in_pack) {
2736  current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2737  for (local_ordinal_type i_block_in_part = 0; i_block_in_part < n_blocks_per_part; ++i_block_in_part) {
2738  current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2739 
2740  if (current_block_idx >= (local_ordinal_type)scalar_values.extent(1))
2741  continue;
2742  for (local_ordinal_type i_in_block = 0; i_in_block < block_size; ++i_in_block) {
2743  myfile << scalar_values(i_block_col, current_block_idx, i_in_block, j_in_block, i_part_in_pack) << std::endl;
2744  }
2745  }
2746  }
2747  }
2748  }
2749  }
2750  myfile.close();
2751 #endif
2752 }
2753 
2754 template <typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2755 KOKKOS_INLINE_FUNCTION void
2756 copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2757  /*
2758  // Kokkos::Experimental::local_deep_copy
2759  auto teamVectorRange =
2760  Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2761  member, view1.extent(0), view1.extent(1), view1.extent(2));
2762 
2763  Kokkos::parallel_for
2764  (teamVectorRange,
2765  [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2766  view1(i,j,k) = view2(i,j,k);
2767  });
2768  */
2769  Kokkos::Experimental::local_deep_copy(member, view1, view2);
2770 }
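// Illustrative sketch (not part of Ifpack2; helper name is made up): what
// copy3DView delegates to. Kokkos::Experimental::local_deep_copy is a
// team-collective copy; the commented-out TeamVectorMDRange variant above
// spells out the same loop. A sequential host analogue, assuming matching
// extents:
#if 0
#include <cstddef>

template <typename DstView, typename SrcView>
inline void copy3D_host(const DstView &dst, const SrcView &src) {
  for (std::size_t i = 0; i < dst.extent(0); ++i)
    for (std::size_t j = 0; j < dst.extent(1); ++j)
      for (std::size_t k = 0; k < dst.extent(2); ++k)
        dst(i, j, k) = src(i, j, k);
}
#endif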
2771 template <typename MatrixType, int ScratchLevel>
2772 struct ExtractAndFactorizeTridiags {
2773  public:
2774  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2775  // a functor cannot have both device_type and execution_space; having both causes a specialization error in Kokkos
2776  using execution_space = typename impl_type::execution_space;
2777  using memory_space = typename impl_type::memory_space;
2779  using local_ordinal_type = typename impl_type::local_ordinal_type;
2780  using size_type = typename impl_type::size_type;
2781  using impl_scalar_type = typename impl_type::impl_scalar_type;
2782  using magnitude_type = typename impl_type::magnitude_type;
2784  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2785  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2787  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2788  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2789  using size_type_1d_view = typename impl_type::size_type_1d_view;
2790  using size_type_2d_view = typename impl_type::size_type_2d_view;
2791  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2793  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2794  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2795  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2796  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2797  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2798  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2799  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
2800  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
2801  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2802  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2803  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2804  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2805  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2806  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
2807  using colinds_view = typename local_crs_graph_type::entries_type;
2808 
2809  using internal_vector_type = typename impl_type::internal_vector_type;
2810  static constexpr int vector_length = impl_type::vector_length;
2811  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2812  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2813  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2814  // half_vector_length is used for block Jacobi factorization.
2815  // Shared memory requirement is twice as large (per vector lane) as for general tridi factorization, so
2816  // reducing vector length (if possible) keeps the shared requirement constant. This avoids the performance
2817  // cliff of switching from level 0 to level 1 scratch.
2818  static constexpr int half_vector_length = impl_type::half_vector_length;
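// Illustrative arithmetic (not part of Ifpack2; helper name is made up):
// why halving the vector length keeps the scratch footprint constant. The
// general tridi path holds one blocksize x blocksize tile per lane, the
// fused Jacobi path holds two (W1 and W2), so the per-team totals match:
#if 0
#include <cstddef>

constexpr std::size_t scratch_bytes(int blocksize, int lanes,
                                    int tiles_per_lane, std::size_t scalar_bytes) {
  return std::size_t(blocksize) * blocksize * lanes * tiles_per_lane * scalar_bytes;
}
// general path:  scratch_bytes(b, vector_length,      1, s)
// fused Jacobi:  scratch_bytes(b, half_vector_length, 2, s)  // equal totals
#endif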
2819 
2821  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2822  using member_type = typename team_policy_type::member_type;
2823 
2824  private:
2825  // part interface
2826  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2827  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2828  const local_ordinal_type max_partsz;
2829  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2830  using size_type_1d_view_tpetra = Kokkos::View<size_t *, typename impl_type::node_device_type>;
2831  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2832  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2833  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2834  // block tridiags
2835  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2836  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2837  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2838  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2839  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2840  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2841  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
2842  const Unmanaged<size_type_1d_view> diag_offsets;
2843  // shared information
2844  const local_ordinal_type blocksize, blocksize_square;
2845  // diagonal safety
2846  const magnitude_type tiny;
2847  const local_ordinal_type vector_loop_size;
2848 
2849  bool hasBlockCrsMatrix;
2850 
2851  public:
2852  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2853  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2854  const Teuchos::RCP<const row_matrix_type> &A_,
2855  const Teuchos::RCP<const crs_graph_type> &G_,
2856  const magnitude_type &tiny_)
2857  : // interface
2858  partptr(interf_.partptr)
2859  , lclrow(interf_.lclrow)
2860  , packptr(interf_.packptr)
2861  , packindices_sub(interf_.packindices_sub)
2862  , packptr_sub(interf_.packptr_sub)
2863  , partptr_sub(interf_.partptr_sub)
2864  , part2packrowidx0_sub(interf_.part2packrowidx0_sub)
2865  , packindices_schur(interf_.packindices_schur)
2866  , max_partsz(interf_.max_partsz)
2867  ,
2868  // block tridiags
2869  pack_td_ptr(btdm_.pack_td_ptr)
2870  , flat_td_ptr(btdm_.flat_td_ptr)
2871  , pack_td_ptr_schur(btdm_.pack_td_ptr_schur)
2872  , A_colindsub(btdm_.A_colindsub)
2873  , internal_vector_values((internal_vector_type *)btdm_.values.data(),
2874  btdm_.values.extent(0),
2875  btdm_.values.extent(1),
2876  btdm_.values.extent(2),
2877  vector_length / internal_vector_length)
2878  , internal_vector_values_schur((internal_vector_type *)btdm_.values_schur.data(),
2879  btdm_.values_schur.extent(0),
2880  btdm_.values_schur.extent(1),
2881  btdm_.values_schur.extent(2),
2882  vector_length / internal_vector_length)
2883  , e_internal_vector_values((internal_vector_type *)btdm_.e_values.data(),
2884  btdm_.e_values.extent(0),
2885  btdm_.e_values.extent(1),
2886  btdm_.e_values.extent(2),
2887  btdm_.e_values.extent(3),
2888  vector_length / internal_vector_length)
2889  , scalar_values((btdm_scalar_type *)btdm_.values.data(),
2890  btdm_.values.extent(0),
2891  btdm_.values.extent(1),
2892  btdm_.values.extent(2),
2893  vector_length)
2894  , scalar_values_schur((btdm_scalar_type *)btdm_.values_schur.data(),
2895  btdm_.values_schur.extent(0),
2896  btdm_.values_schur.extent(1),
2897  btdm_.values_schur.extent(2),
2898  vector_length)
2899  , e_scalar_values((btdm_scalar_type *)btdm_.e_values.data(),
2900  btdm_.e_values.extent(0),
2901  btdm_.e_values.extent(1),
2902  btdm_.e_values.extent(2),
2903  btdm_.e_values.extent(3),
2904  vector_length)
2905  , d_inv(btdm_.d_inv)
2906  , diag_offsets(btdm_.diag_offsets)
2907  , blocksize(btdm_.values.extent(1))
2908  , blocksize_square(blocksize * blocksize)
2909  ,
2910  // diagonal weight to avoid zero pivots
2911  tiny(tiny_)
2912  , vector_loop_size(vector_length / internal_vector_length) {
2913  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2914  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2915 
2916  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2917  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2918 
2919  hasBlockCrsMatrix = !A_bcrs.is_null();
2920 
2921  A_block_rowptr = G_->getLocalGraphDevice().row_map;
2922  if (hasBlockCrsMatrix) {
2923  A_values = const_cast<block_crs_matrix_type *>(A_bcrs.get())->getValuesDeviceNonConst();
2924  } else {
2925  A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2926  A_values = A_crs->getLocalValuesDevice(Tpetra::Access::ReadOnly);
2927  }
2928  }
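// Illustrative sketch (not part of Ifpack2; helper name is made up): the
// aliasing pattern used in the initializer list above. One SIMD-packed
// allocation is viewed both as internal_vector_type entries and as raw
// scalars; only the pointer type and the trailing extent change.
#if 0
#include <Kokkos_Core.hpp>

template <typename Scalar, int VL, typename VectorView>
auto view_as_scalars(const VectorView &v) {
  // v: (n0, n1, n2) of Vector<Scalar, VL>  ->  (n0, n1, n2, VL) of Scalar
  return Kokkos::View<Scalar ****, typename VectorView::memory_space>(
      reinterpret_cast<Scalar *>(v.data()),
      v.extent(0), v.extent(1), v.extent(2), VL);
}
#endif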
2929 
2930  private:
2931  KOKKOS_INLINE_FUNCTION
2932  void
2933  extract(local_ordinal_type partidx,
2934  local_ordinal_type local_subpartidx,
2935  local_ordinal_type npacks) const {
2936 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2937  printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2938 #endif
2939  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2940  const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2941  local_ordinal_type kfs[vector_length] = {};
2942  local_ordinal_type ri0[vector_length] = {};
2943  local_ordinal_type nrows[vector_length] = {};
2944 
2945  for (local_ordinal_type vi = 0; vi < npacks; ++vi, ++partidx) {
2946  kfs[vi] = flat_td_ptr(partidx, local_subpartidx);
2947  ri0[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 0);
2948  nrows[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidx, 1) - ri0[vi];
2949 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2950  printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2951  printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2952  printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2953 #endif
2954  }
2955  local_ordinal_type tr_min = 0;
2956  local_ordinal_type tr_max = nrows[0];
2957  if (local_subpartidx % 2 == 1) {
2958  tr_min -= 1;
2959  tr_max += 1;
2960  }
2961 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2962  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2963 #endif
2964  for (local_ordinal_type tr = tr_min, j = 0; tr < tr_max; ++tr) {
2965  for (local_ordinal_type e = 0; e < 3; ++e) {
2966  if (hasBlockCrsMatrix) {
2967  const impl_scalar_type *block[vector_length] = {};
2968  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2969  const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2970 
2971  block[vi] = &A_values(Aj * blocksize_square);
2972  }
2973  const size_type pi = kps + j;
2974 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2975  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2976 #endif
2977  ++j;
2978  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
2979  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
2980  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2981  auto &v = internal_vector_values(pi, ii, jj, 0);
2982  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2983  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2984  }
2985  }
2986  }
2987  } else {
2988  const size_type pi = kps + j;
2989 
2990  for (local_ordinal_type vi = 0; vi < npacks; ++vi) {
2991  const size_type Aj_c = A_colindsub(kfs[vi] + j);
2992 
2993  for (local_ordinal_type ii = 0; ii < blocksize; ++ii) {
2994  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr) * blocksize + ii);
2995 
2996  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
2997  scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c * blocksize + jj);
2998  }
2999  }
3000  }
3001  ++j;
3002  }
3003  if (nrows[0] == 1) break;
3004  if (local_subpartidx % 2 == 0) {
3005  if (e == 1 && (tr == 0 || tr + 1 == nrows[0])) break;
3006  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3007  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr + 1 == nrows[vi])) {
3008  npacks = vi;
3009  break;
3010  }
3011  }
3012  } else {
3013  if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3014  for (local_ordinal_type vi = 1; vi < npacks; ++vi) {
3015  if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3016  npacks = vi;
3017  break;
3018  }
3019  }
3020  }
3021  }
3022  }
3023  }
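// Illustrative sketch (not part of Ifpack2; names are made up): the
// lane-packing idea inside extract(). Entry idx of the corresponding block
// from npacks different parts is gathered into consecutive SIMD lanes of a
// single packed value.
#if 0
template <typename PackedValue, typename Scalar>
inline void pack_lanes(PackedValue &dst, const Scalar *const src_blocks[],
                       int npacks, int idx) {
  for (int vi = 0; vi < npacks; ++vi)
    dst[vi] = src_blocks[vi][idx];  // one scalar load per part, one lane each
}
#endif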
3024 
3025  KOKKOS_INLINE_FUNCTION
3026  void
3027  extract(const member_type &member,
3028  const local_ordinal_type &partidxbeg,
3029  local_ordinal_type local_subpartidx,
3030  const local_ordinal_type &npacks,
3031  const local_ordinal_type &vbeg) const {
3032 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3033  printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3034 #endif
3035  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3036  local_ordinal_type kfs_vals[internal_vector_length] = {};
3037  local_ordinal_type ri0_vals[internal_vector_length] = {};
3038  local_ordinal_type nrows_vals[internal_vector_length] = {};
3039 
3040  const size_type kps = pack_td_ptr(partidxbeg, local_subpartidx);
3041  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3042  kfs_vals[vi] = flat_td_ptr(partidxbeg + vi, local_subpartidx);
3043  ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 0);
3044  nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0) * local_subpartidx + partidxbeg + vi, 1) - ri0_vals[vi];
3045 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3046  printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3047  printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3048  printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3049 #endif
3050  }
3051 
3052  local_ordinal_type j_vals[internal_vector_length] = {};
3053 
3054  local_ordinal_type tr_min = 0;
3055  local_ordinal_type tr_max = nrows_vals[0];
3056  if (local_subpartidx % 2 == 1) {
3057  tr_min -= 1;
3058  tr_max += 1;
3059  }
3060 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3061  printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3062 #endif
3063  for (local_ordinal_type tr = tr_min; tr < tr_max; ++tr) {
3064  for (local_ordinal_type v = vbeg, vi = 0; v < npacks && vi < internal_vector_length; ++v, ++vi) {
3065  const local_ordinal_type nrows = nrows_vals[vi];  // identical for even and odd local_subpartidx
3066  if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows + 1)) {
3067  auto &j = j_vals[vi];
3068  const local_ordinal_type kfs = kfs_vals[vi];
3069  const local_ordinal_type ri0 = ri0_vals[vi];
3070  local_ordinal_type lbeg, lend;
3071  if (local_subpartidx % 2 == 0) {
3072  lbeg = (tr == tr_min ? 1 : 0);
3073  lend = (tr == nrows - 1 ? 2 : 3);
3074  } else {
3075  lbeg = 0;
3076  lend = 3;
3077  if (tr == tr_min) {
3078  lbeg = 1;
3079  lend = 2;
3080  } else if (tr == nrows) {
3081  lbeg = 0;
3082  lend = 1;
3083  }
3084  }
3085  if (hasBlockCrsMatrix) {
3086  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3087  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3088  const impl_scalar_type *block = &A_values(Aj * blocksize_square);
3089  const size_type pi = kps + j;
3090 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3091  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3092 #endif
3093  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3094  [&](const local_ordinal_type &ii) {
3095  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3096  scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii, jj, blocksize)]);
3097  }
3098  });
3099  }
3100  } else {
3101  for (local_ordinal_type l = lbeg; l < lend; ++l, ++j) {
3102  const size_type Aj_c = A_colindsub(kfs + j);
3103  const size_type pi = kps + j;
3104  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
3105  [&](const local_ordinal_type &ii) {
3106  auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr) * blocksize + ii);
3107  for (local_ordinal_type jj = 0; jj < blocksize; ++jj) {
3108  scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c * blocksize + jj);
3109  }
3110  });
3111  }
3112  }
3113  }
3114  }
3115  }
3116  }
3117 
3118  template <typename AAViewType,
3119  typename WWViewType>
3120  KOKKOS_INLINE_FUNCTION void
3121  factorize_subline(const member_type &member,
3122  const local_ordinal_type &i0,
3123  const local_ordinal_type &nrows,
3124  const local_ordinal_type &v,
3125  const AAViewType &AA,
3126  const WWViewType &WW) const {
3127  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3128 
3129  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3130  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3131 
3132  // constant
3133  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3134 
3135 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3136  printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3137 #endif
3138 
3139  // subview pattern
3140  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3141  KB::LU<member_type,
3142  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3143 
3144  if (nrows > 1) {
3145  auto B = A;
3146  auto C = A;
3147  local_ordinal_type i = i0;
3148  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
3149 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3150  printf("tr = %d, i = %d;\n", tr, i);
3151 #endif
3152  B.assign_data(&AA(i + 1, 0, 0, v));
3153  KB::Trsm<member_type,
3154  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3155  default_mode_type, default_algo_type>::invoke(member, one, A, B);
3156  C.assign_data(&AA(i + 2, 0, 0, v));
3157  KB::Trsm<member_type,
3158  KB::Side::Right, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3159  default_mode_type, default_algo_type>::invoke(member, one, A, C);
3160  A.assign_data(&AA(i + 3, 0, 0, v));
3161 
3162  member.team_barrier();
3163  KB::Gemm<member_type,
3164  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3165  default_mode_type, default_algo_type>::invoke(member, -one, C, B, one, A);
3166  KB::LU<member_type,
3167  default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
3168  }
3169  } else {
3170  // for block Jacobi, invert the matrix here
3171  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3172  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, A, W);
3173  KB::SetIdentity<member_type, default_mode_type>::invoke(member, A);
3174  member.team_barrier();
3175  KB::Trsm<member_type,
3176  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3177  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3178  KB::Trsm<member_type,
3179  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3180  default_mode_type, default_algo_type>::invoke(member, one, W, A);
3181  }
3182  }
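// Illustrative sketch (not part of Ifpack2; names are made up): the scalar
// analogue of the block recurrence above. With diagonal d[i], sub-diagonal
// a[i] (a[0] unused) and super-diagonal c[i], factorize_subline reduces to
// the Thomas factorization when blocksize == 1: Trsm(Right, Upper) becomes
// the multiplier divide and Gemm becomes the diagonal update.
#if 0
#include <cstddef>
#include <vector>

inline void tridiag_lu(const std::vector<double> &a,
                       const std::vector<double> &c,
                       std::vector<double> &d,    // in: diagonal, out: U diag
                       std::vector<double> &l) {  // out: L multipliers
  l.assign(d.size(), 0.0);
  for (std::size_t i = 1; i < d.size(); ++i) {
    l[i] = a[i] / d[i - 1];   // C <- C * U^{-1}
    d[i] -= l[i] * c[i - 1];  // A <- A - C * B, then LU(A)
  }
}
#endif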
3183 
3184  public:
3185  struct ExtractAndFactorizeSubLineTag {};
3186  struct ExtractAndFactorizeFusedJacobiTag {};
3187  struct ExtractBCDTag {};
3188  struct ComputeETag {};
3189  struct ComputeSchurTag {};
3190  struct FactorizeSchurTag {};
3191 
3192  KOKKOS_INLINE_FUNCTION
3193  void
3194  operator()(const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3195  // btdm is packed and sorted starting from the largest line
3196  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3197 
3198  const local_ordinal_type subpartidx = packptr_sub(packidx);
3199  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3200  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3201  const local_ordinal_type partidx = subpartidx % n_parts;
3202 
3203  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3204  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3205  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3206 
3207  internal_vector_scratch_type_3d_view
3208  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3209 
3210 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3211  printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3212  printf("vector_loop_size = %d\n", vector_loop_size);
3213 #endif
3214 
3215  if (vector_loop_size == 1) {
3216  extract(partidx, local_subpartidx, npacks);
3217  factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3218  } else {
3219  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3220  [&](const local_ordinal_type &v) {
3221  const local_ordinal_type vbeg = v * internal_vector_length;
3222 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3223  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3224 #endif
3225  if (vbeg < npacks)
3226  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3227  // this is not safe if the vector loop size differs from the vector size of
3228  // the team policy; we always ensure they match when constructing the team policy
3229  member.team_barrier();
3230  factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3231  });
3232  }
3233  }
3234 
3235  KOKKOS_INLINE_FUNCTION
3236  void
3237  operator()(const ExtractAndFactorizeFusedJacobiTag &, const member_type &member) const {
3238  using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
3239  using default_mode_type = typename default_mode_and_algo_type::mode_type;
3240  using default_algo_type = typename default_mode_and_algo_type::algo_type;
3241  // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i)
3242  // We can simply pull the diagonal entry from A into d_inv
3243  btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3244  btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
3245  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3246  const local_ordinal_type nrows = lclrow.extent(0);
3247  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, half_vector_length),
3248  [&](const local_ordinal_type &v) {
3249  local_ordinal_type row = member.league_rank() * half_vector_length + v;
3250  // diag_offsets(row) is the offset of this row's diagonal block within A_values
3251  auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
3252  auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
3253  if (row < nrows) {
3254  // View the diagonal block of A in row as 2D row-major
3255  const impl_scalar_type *A_diag = A_values.data() + diag_offsets(row);
3256  // Copy the diag into scratch slice W1
3257  // (copying elements directly is better than KokkosBatched copy)
3258  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3259  [&](int i) {
3260  W1.data()[i] = A_diag[i];
3261  });
3262  // and set W2 to identity in preparation to invert with 2 x Trsm
3263  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W2);
3264  } else {
3265  // if this vector lane has no block to invert, then set W1 to identity
3266  // so that LU still has a matrix to work on. LU uses team barriers so
3267  // having some lanes run it and some not will deadlock.
3268  KB::SetIdentity<member_type, default_mode_type>::invoke(member, W1);
3269  }
3270  member.team_barrier();
3271  // LU factorize in-place
3272  KB::LU<member_type, default_mode_type, KB::Algo::LU::Unblocked>::invoke(member, W1, tiny);
3273  member.team_barrier();
3274  KB::Trsm<member_type,
3275  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
3276  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3277  KB::Trsm<member_type,
3278  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
3279  default_mode_type, default_algo_type>::invoke(member, one, W1, W2);
3280  member.team_barrier();
3281  if (row < nrows) {
3282  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
3283  [&](int i) {
3284  auto d_inv_block = &d_inv(row, 0, 0);
3285  d_inv_block[i] = W2.data()[i];
3286  });
3287  }
3288  });
3289  }
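// Illustrative sketch (not part of Ifpack2; names are made up): the
// inversion pattern used above -- LU factorize, then two triangular solves
// against the identity -- written for one dense n x n row-major matrix,
// without the tiny-shift or team parallelism.
#if 0
#include <vector>

inline void invert_via_lu(std::vector<double> &A,  // in: matrix, out: its LU
                          std::vector<double> &Ainv, int n) {
  Ainv.assign(n * n, 0.0);
  for (int i = 0; i < n; ++i) Ainv[i * n + i] = 1.0;  // Ainv <- I (cf. W2)
  // in-place LU without pivoting (cf. KB::LU on W1)
  for (int k = 0; k < n; ++k)
    for (int i = k + 1; i < n; ++i) {
      A[i * n + k] /= A[k * n + k];
      for (int j = k + 1; j < n; ++j) A[i * n + j] -= A[i * n + k] * A[k * n + j];
    }
  for (int col = 0; col < n; ++col) {
    // forward solve L y = e_col, unit diagonal (cf. first Trsm)
    for (int i = 1; i < n; ++i)
      for (int k = 0; k < i; ++k) Ainv[i * n + col] -= A[i * n + k] * Ainv[k * n + col];
    // backward solve U x = y (cf. second Trsm)
    for (int i = n - 1; i >= 0; --i) {
      for (int k = i + 1; k < n; ++k) Ainv[i * n + col] -= A[i * n + k] * Ainv[k * n + col];
      Ainv[i * n + col] /= A[i * n + i];
    }
  }
}
#endif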
3290 
3291  KOKKOS_INLINE_FUNCTION
3292  void
3293  operator()(const ExtractBCDTag &, const member_type &member) const {
3294  // btdm is packed and sorted starting from the largest line
3295  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3296  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3297  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3298 
3299  const local_ordinal_type subpartidx = packptr_sub(packidx);
3300  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3301  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3302  const local_ordinal_type partidx = subpartidx % n_parts;
3303 
3304  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3305  // const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3306  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3307 
3308  if (vector_loop_size == 1) {
3309  extract(partidx, local_subpartidx, npacks);
3310  } else {
3311  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3312  [&](const local_ordinal_type &v) {
3313  const local_ordinal_type vbeg = v * internal_vector_length;
3314 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3315  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3316  printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3317 #endif
3318  if (vbeg < npacks)
3319  extract(member, partidx + vbeg, local_subpartidx, npacks, vbeg);
3320  });
3321  }
3322 
3323  member.team_barrier();
3324 
3325  const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3326  const size_type kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 1;
3327 
3328  const local_ordinal_type r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3329  const local_ordinal_type r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3330 
3331 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3332  printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3333 #endif
3334 
3335  // Need to copy D to e_internal_vector_values.
3336  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3337  Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3338 
3339  copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3340  Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3341  }
3342 
3343  KOKKOS_INLINE_FUNCTION
3344  void
3345  operator()(const ComputeETag &, const member_type &member) const {
3346  // btdm is packed and sorted starting from the largest line
3347  const local_ordinal_type packidx = packindices_sub(member.league_rank());
3348 
3349  const local_ordinal_type subpartidx = packptr_sub(packidx);
3350  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3351  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3352  const local_ordinal_type partidx = subpartidx % n_parts;
3353 
3354  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
3355  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3356  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
3357  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
3358  const local_ordinal_type num_vectors = blocksize;
3359 
3360  (void)npacks;
3361 
3362  internal_vector_scratch_type_3d_view
3363  WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3364  if (local_subpartidx == 0) {
3365  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3366  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3367  });
3368  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
3369  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3370  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3371  });
3372  } else {
3373  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3374  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3375  solveMultiVector<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3376  });
3377  }
3378  }
3379 
3380  KOKKOS_INLINE_FUNCTION
3381  void
3382  operator()(const ComputeSchurTag &, const member_type &member) const {
3383  // btdm is packed and sorted starting from the largest line
3384  const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3385  const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3386  const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3387 
3388  const local_ordinal_type subpartidx = packptr_sub(packidx);
3389  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3390  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
3391  const local_ordinal_type partidx = subpartidx % n_parts;
3392 
3393  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3394  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
3395  // const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3396  // const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3397 
3398  // Compute S = D - C E
3399 
3400  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
3401  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
3402  const local_ordinal_type i0_offset = i0 + 2;  // same offset whether local_subpartidx_schur == 0 or not
3403 
3404  for (local_ordinal_type i = 0; i < 4; ++i) { // pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3405  copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3406  Kokkos::subview(internal_vector_values, i0_offset + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3407  }
3408 
3409  member.team_barrier();
3410 
3411  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3412 
3413  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx) + 1;
3414  const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx + 1) - 2;
3415 
3416  const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx, local_subpartidx) - 1;
3417  const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx, local_subpartidx) + 2;
3418 
3419  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
3420 
3421  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3422  typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3423 
3424  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
3425  for (size_type i = 0; i < pack_td_ptr_schur(partidx, local_subpartidx_schur + 1) - pack_td_ptr_schur(partidx, local_subpartidx_schur); ++i) {
3426  local_ordinal_type e_r, e_c, c_kps;
3427 
3428  if (local_subpartidx_schur == 0) {
3429  if (i == 0) {
3430  e_r = e_r1;
3431  e_c = 0;
3432  c_kps = c_kps1;
3433  } else if (i == 3) {
3434  e_r = e_r2;
3435  e_c = 1;
3436  c_kps = c_kps2;
3437  } else if (i == 4) {
3438  e_r = e_r2;
3439  e_c = 0;
3440  c_kps = c_kps2;
3441  } else {
3442  continue;
3443  }
3444  } else {
3445  if (i == 0) {
3446  e_r = e_r1;
3447  e_c = 1;
3448  c_kps = c_kps1;
3449  } else if (i == 1) {
3450  e_r = e_r1;
3451  e_c = 0;
3452  c_kps = c_kps1;
3453  } else if (i == 4) {
3454  e_r = e_r2;
3455  e_c = 1;
3456  c_kps = c_kps2;
3457  } else if (i == 5) {
3458  e_r = e_r2;
3459  e_c = 0;
3460  c_kps = c_kps2;
3461  } else {
3462  continue;
3463  }
3464  }
3465 
3466  auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx, local_subpartidx_schur) + i, Kokkos::ALL(), Kokkos::ALL(), v);
3467  auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3468  auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3469  KB::Gemm<member_type,
3470  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
3471  default_mode_type, default_algo_type>::invoke(member, -one, C, E, one, S);
3472  }
3473  });
3474  }
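// Illustrative sketch (not part of Ifpack2; names are made up): the update
// applied by each Gemm above, S <- S - C * E, for one dense n x n row-major
// block. D was already copied into S by the copy3DView calls, so subtracting
// C * E completes the Schur complement S = D - C E.
#if 0
inline void schur_update(double *S, const double *C, const double *E, int n) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j) {
      double acc = 0.0;
      for (int k = 0; k < n; ++k) acc += C[i * n + k] * E[k * n + j];
      S[i * n + j] -= acc;
    }
}
#endif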
3475 
3476  KOKKOS_INLINE_FUNCTION
3477  void
3478  operator()(const FactorizeSchurTag &, const member_type &member) const {
3479  const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3480 
3481  const local_ordinal_type subpartidx = packptr_sub(packidx);
3482 
3483  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3484  const local_ordinal_type partidx = subpartidx % n_parts;
3485 
3486  const local_ordinal_type i0 = pack_td_ptr_schur(partidx, 0);
3487  const local_ordinal_type nrows = 2 * (pack_td_ptr_schur.extent(1) - 1);
3488 
3489  internal_vector_scratch_type_3d_view
3490  WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3491 
3492 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3493  printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3494 #endif
3495 
3496  if (vector_loop_size == 1) {
3497  factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3498  } else {
3499  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),
3500  [&](const local_ordinal_type &v) {
3501  factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3502  });
3503  }
3504  }
3505 
3506  void run() {
3507  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3508  const local_ordinal_type team_size =
3509  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3510  recommended_team_size(blocksize, vector_length, internal_vector_length);
3511  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3512  shmem_size(blocksize, blocksize, vector_loop_size);
3513 
3514  {
3515 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3516  printf("Start ExtractAndFactorizeSubLineTag\n");
3517 #endif
3518  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3519  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
3520  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3521 
3522  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3523  writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3524 
3525  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3526  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3527  policy, *this);
3528  execution_space().fence();
3529 
3530  writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3531 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3532  printf("End ExtractAndFactorizeSubLineTag\n");
3533 #endif
3534  }
3535 
3536  if (packindices_schur.extent(1) > 0) {
3537  {
3538 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3539  printf("Start ExtractBCDTag\n");
3540 #endif
3541  Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3542  Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3543 
3544  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3545 
3546  {
3547  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3548  Kokkos::TeamPolicy<execution_space, ExtractBCDTag>
3549  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3550 
3551  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3552  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3553  policy, *this);
3554  execution_space().fence();
3555  }
3556 
3557 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3558  printf("End ExtractBCDTag\n");
3559 #endif
3560  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3561 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3562  printf("Start ComputeETag\n");
3563 #endif
3564  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3565  {
3566  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3567  Kokkos::TeamPolicy<execution_space, ComputeETag>
3568  policy(packindices_sub.extent(0), team_size, vector_loop_size);
3569 
3570  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3571  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3572  policy, *this);
3573  execution_space().fence();
3574  }
3575  write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3576 
3577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3578  printf("End ComputeETag\n");
3579 #endif
3580  }
3581 
3582  {
3583 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3584  printf("Start ComputeSchurTag\n");
3585 #endif
3586  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3587  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3588  Kokkos::TeamPolicy<execution_space, ComputeSchurTag>
3589  policy(packindices_schur.extent(0) * packindices_schur.extent(1), team_size, vector_loop_size);
3590 
3591  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3592  policy, *this);
3593  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3594  execution_space().fence();
3595 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3596  printf("End ComputeSchurTag\n");
3597 #endif
3598  }
3599 
3600  {
3601 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3602  printf("Start FactorizeSchurTag\n");
3603 #endif
3604  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3605  Kokkos::TeamPolicy<execution_space, FactorizeSchurTag>
3606  policy(packindices_schur.extent(0), team_size, vector_loop_size);
3607  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3608  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3609  policy, *this);
3610  execution_space().fence();
3611  writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3612 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3613  printf("End FactorizeSchurTag\n");
3614 #endif
3615  }
3616  }
3617 
3618  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3619  }
3620 
3621  void run_fused_jacobi() {
3622  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3623  const local_ordinal_type team_size =
3624  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3625  recommended_team_size(blocksize, half_vector_length, 1);
3626  const local_ordinal_type per_team_scratch =
3627  btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);
3628  {
3629  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
3630  Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
3631  policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);
3632 
3633  policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3634  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
3635  policy, *this);
3636  }
3637  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3638  }
3639 };
3640 
3644 template <typename MatrixType>
3645 void performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3646  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3647  const BlockHelperDetails::PartInterface<MatrixType> &interf,
3648  BlockTridiags<MatrixType> &btdm,
3649  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
3650  bool use_fused_jacobi) {
3651  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3652  using execution_space = typename impl_type::execution_space;
3653  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3654  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3655  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
3656 
3657  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3658 
3659  int blocksize = btdm.values.extent(1);
3660  // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3661  // For large block sizes, we have to fall back to level 1 scratch.
3662  int scratch_required;
3663  if (!use_fused_jacobi) {
3664  // General path scratch requirement
3665  scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3666  } else {
3667  // Block Jacobi scratch requirement: measured in scalars, and uses twice as much (in bytes) per vector lane as the general path.
3668  scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
3669  }
3670 
3671  int max_scratch = team_policy_type::scratch_size_max(0);
3672 
3673  if (scratch_required < max_scratch) {
3674  // Can use level 0 scratch
3675  ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3676  if (!use_fused_jacobi)
3677  function.run();
3678  else
3679  function.run_fused_jacobi();
3680  } else {
3681  // Not enough level 0 scratch, so fall back to level 1
3682  ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3683  if (!use_fused_jacobi)
3684  function.run();
3685  else
3686  function.run_fused_jacobi();
3687  }
3688  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3689 }
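// Illustrative sketch (not part of Ifpack2; names are made up): the scratch
// level dispatch above, reduced to its skeleton. ScratchLevel is a template
// parameter, so both functor instantiations exist at compile time and the
// runtime branch only picks which one to launch.
#if 0
template <int ScratchLevel>
struct NumericFunctor {
  void run() { /* launch kernels using team_scratch(ScratchLevel) */ }
};

inline void dispatch(int required_bytes, int level0_max_bytes) {
  if (required_bytes < level0_max_bytes)
    NumericFunctor<0>{}.run();  // fits in fast on-chip (level 0) scratch
  else
    NumericFunctor<1>{}.run();  // fall back to larger, slower level 1 scratch
}
#endif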
3690 
3694 template <typename MatrixType>
3695 class MultiVectorConverter {
3696  public:
3697  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3698  using execution_space = typename impl_type::execution_space;
3699  using memory_space = typename impl_type::memory_space;
3700 
3701  using local_ordinal_type = typename impl_type::local_ordinal_type;
3702  using impl_scalar_type = typename impl_type::impl_scalar_type;
3703  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3704  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3705  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3706  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3707  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3708  using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3709  static constexpr int vector_length = impl_type::vector_length;
3710 
3711  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3712 
3713  private:
3714  // part interface
3715  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3716  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3717  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3718  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3719  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3720  const local_ordinal_type blocksize;
3721  const local_ordinal_type num_vectors;
3722 
3723  // packed multivector output (or input)
3724  vector_type_3d_view packed_multivector;
3725  const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3726 
3727  template <typename TagType>
3728  KOKKOS_INLINE_FUNCTION void copy_multivectors(const local_ordinal_type &j,
3729  const local_ordinal_type &vi,
3730  const local_ordinal_type &pri,
3731  const local_ordinal_type &ri0) const {
3732  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3733  for (local_ordinal_type i = 0; i < blocksize; ++i)
3734  packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3735  }
3736 
3737  public:
3738  MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3739  const vector_type_3d_view &pmv)
3740  : partptr(interf.partptr)
3741  , packptr(interf.packptr)
3742  , part2packrowidx0(interf.part2packrowidx0)
3743  , part2rowidx0(interf.part2rowidx0)
3744  , lclrow(interf.lclrow)
3745  , blocksize(pmv.extent(1))
3746  , num_vectors(pmv.extent(2))
3747  , packed_multivector(pmv) {}
3748 
3749  // TODO: modify this routine to mirror the team-level functions
3750  KOKKOS_INLINE_FUNCTION
3751  void
3752  operator()(const local_ordinal_type &packidx) const {
3753  local_ordinal_type partidx = packptr(packidx);
3754  local_ordinal_type npacks = packptr(packidx + 1) - partidx;
3755  const local_ordinal_type pri0 = part2packrowidx0(partidx);
3756 
3757  local_ordinal_type ri0[vector_length] = {};
3758  local_ordinal_type nrows[vector_length] = {};
3759  for (local_ordinal_type v = 0; v < npacks; ++v, ++partidx) {
3760  ri0[v] = part2rowidx0(partidx);
3761  nrows[v] = part2rowidx0(partidx + 1) - ri0[v];
3762  }
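 // Parts within a pack are ordered from longest to shortest, so once row j
 // reaches the length of part cnt it has passed the end of every later part
 // in the pack; shrink npacks accordingly for the remaining rows.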
3763  for (local_ordinal_type j = 0; j < nrows[0]; ++j) {
3764  local_ordinal_type cnt = 1;
3765  for (; cnt < npacks && j != nrows[cnt]; ++cnt)
3766  ;
3767  npacks = cnt;
3768  const local_ordinal_type pri = pri0 + j;
3769  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3770  for (local_ordinal_type i = 0; i < blocksize; ++i)
3771  for (local_ordinal_type v = 0; v < npacks; ++v)
3772  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0[v] + j) + i, col));
3773  }
3774  }
3775 
3776  KOKKOS_INLINE_FUNCTION
3777  void
3778  operator()(const member_type &member) const {
3779  const local_ordinal_type packidx = member.league_rank();
3780  const local_ordinal_type partidx_begin = packptr(packidx);
3781  const local_ordinal_type npacks = packptr(packidx + 1) - partidx_begin;
3782  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3783  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3784  const local_ordinal_type partidx = partidx_begin + v;
3785  const local_ordinal_type ri0 = part2rowidx0(partidx);
3786  const local_ordinal_type nrows = part2rowidx0(partidx + 1) - ri0;
3787 
3788  if (nrows == 1) {
3789  const local_ordinal_type pri = pri0;
3790  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
3791  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3792  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0) + i, col));
3793  });
3794  }
3795  } else {
3796  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3797  const local_ordinal_type pri = pri0 + j;
3798  for (local_ordinal_type col = 0; col < num_vectors; ++col)
3799  for (local_ordinal_type i = 0; i < blocksize; ++i)
3800  packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize * lclrow(ri0 + j) + i, col));
3801  });
3802  }
3803  });
3804  }
3805 
3806  void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3807  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3808  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3809 
3810  scalar_multivector = scalar_multivector_;
3811  if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3812  const local_ordinal_type vl = vector_length;
3813  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3814  Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
3815  } else {
3816  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3817  Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
3818  }
3819  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3820  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3821  }
3822 };
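// Sketch of the usage pattern (as in applyInverseJacobi below): construct the
// converter once per solve and re-run it each sweep on the current source
// view, e.g.
//   MultiVectorConverter<MatrixType> converter(interf, pmv);
//   converter.run(XX); // XX: rank-2 local device view of the source vectors
// run() dispatches a TeamPolicy on device builds and a RangePolicy on host.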
3823 
3824 template <typename MemorySpace>
3825 struct SolveTridiagsDefaultModeAndAlgo;
3826 
3827 
3828 template <>
3829 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3830  typedef KB::Mode::Serial mode_type;
3831  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3832 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3833  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3834 #else
3835  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3836 #endif
3837  static int recommended_team_size(const int /* blksize */,
3838  const int /* vector_length */,
3839  const int /* internal_vector_length */) {
3840  return 1;
3841  }
3842 };
3843 
3844 #if defined(KOKKOS_ENABLE_CUDA)
3845 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3846  const int vector_length,
3847  const int internal_vector_length) {
3848  const int vector_size = vector_length / internal_vector_length;
3849  int total_team_size(0);
3850  if (blksize <= 5)
3851  total_team_size = 32;
3852  else if (blksize <= 9)
3853  total_team_size = 32; // 64
3854  else if (blksize <= 12)
3855  total_team_size = 96;
3856  else if (blksize <= 16)
3857  total_team_size = 128;
3858  else
3859  total_team_size = 160;
3862  return total_team_size / vector_size;
3863 }
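// Worked example for the heuristic above: blksize = 16, vector_length = 8,
// internal_vector_length = 2 gives vector_size = 8 / 2 = 4 and
// total_team_size = 128, so the recommended team size is 128 / 4 = 32.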
3864 
3865 template <>
3866 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3867  typedef KB::Mode::Team mode_type;
3868  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3869  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3870  static int recommended_team_size(const int blksize,
3871  const int vector_length,
3872  const int internal_vector_length) {
3873  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3874  }
3875 };
3876 template <>
3877 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3878  typedef KB::Mode::Team mode_type;
3879  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3880  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3881  static int recommended_team_size(const int blksize,
3882  const int vector_length,
3883  const int internal_vector_length) {
3884  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3885  }
3886 };
3887 #endif
3888 
3889 #if defined(KOKKOS_ENABLE_HIP)
3890 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3891  const int vector_length,
3892  const int internal_vector_length) {
3893  const int vector_size = vector_length / internal_vector_length;
3894  int total_team_size(0);
3895  if (blksize <= 5)
3896  total_team_size = 32;
3897  else if (blksize <= 9)
3898  total_team_size = 32; // 64
3899  else if (blksize <= 12)
3900  total_team_size = 96;
3901  else if (blksize <= 16)
3902  total_team_size = 128;
3903  else
3904  total_team_size = 160;
3907  return total_team_size / vector_size;
3908 }
3909 
3910 template <>
3911 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3912  typedef KB::Mode::Team mode_type;
3913  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3914  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3915  static int recommended_team_size(const int blksize,
3916  const int vector_length,
3917  const int internal_vector_length) {
3918  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3919  }
3920 };
3921 template <>
3922 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3923  typedef KB::Mode::Team mode_type;
3924  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3925  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3926  static int recommended_team_size(const int blksize,
3927  const int vector_length,
3928  const int internal_vector_length) {
3929  return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3930  }
3931 };
3932 #endif
3933 
3934 #if defined(KOKKOS_ENABLE_SYCL)
3935 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
3936  const int vector_length,
3937  const int internal_vector_length) {
3938  const int vector_size = vector_length / internal_vector_length;
3939  int total_team_size(0);
3940  if (blksize <= 5)
3941  total_team_size = 32;
3942  else if (blksize <= 9)
3943  total_team_size = 32; // 64
3944  else if (blksize <= 12)
3945  total_team_size = 96;
3946  else if (blksize <= 16)
3947  total_team_size = 128;
3948  else
3949  total_team_size = 160;
3952  return total_team_size / vector_size;
3953 }
3954 
3955 template <>
3956 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3957  typedef KB::Mode::Team mode_type;
3958  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3959  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3960  static int recommended_team_size(const int blksize,
3961  const int vector_length,
3962  const int internal_vector_length) {
3963  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3964  }
3965 };
3966 template <>
3967 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3968  typedef KB::Mode::Team mode_type;
3969  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3970  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3971  static int recommended_team_size(const int blksize,
3972  const int vector_length,
3973  const int internal_vector_length) {
3974  return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3975  }
3976 };
3977 #endif
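// Summary of the specializations above: host builds solve each pack serially
// within a thread (KB::Mode::Serial, with CompactMKL when available), while
// the CUDA/HIP/SYCL builds cooperate across a team (KB::Mode::Team) whose
// size is chosen from the block size by the shared heuristic above.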
3978 
3979 template <typename MatrixType>
3980 struct SolveTridiags {
3981  public:
3982  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3983  using execution_space = typename impl_type::execution_space;
3984 
3985  using local_ordinal_type = typename impl_type::local_ordinal_type;
3986  using size_type = typename impl_type::size_type;
3987  using impl_scalar_type = typename impl_type::impl_scalar_type;
3988  using magnitude_type = typename impl_type::magnitude_type;
3989  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3990  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3992  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3993  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
3994  using size_type_2d_view = typename impl_type::size_type_2d_view;
3996  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3997  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
3998  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
3999  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
4000 
4001  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
4002 
4003  using internal_vector_type = typename impl_type::internal_vector_type;
4004  static constexpr int vector_length = impl_type::vector_length;
4005  static constexpr int internal_vector_length = impl_type::internal_vector_length;
4006 
4008  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4009  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
4010 
4012  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
4013  using member_type = typename team_policy_type::member_type;
4014 
4015  private:
4016  // part interface
4017  local_ordinal_type n_subparts_per_part;
4018  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
4019  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
4020  const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
4021  const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
4022  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
4023  const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
4024  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
4025  const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
4026 
4027  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
4028  const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
4029 
4030  // block tridiags
4031  const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
4032 
4033  // block tridiags values
4034  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
4035  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
4036  const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
4037 
4038  internal_vector_type_4d_view X_internal_vector_values_schur;
4039 
4040  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
4041  const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
4042 
4043  const local_ordinal_type vector_loop_size;
4044 
4045  // copy to multivectors : damping factor and Y_scalar_multivector
4046  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
4047 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
4048  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4049 #else
4050  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
4051 #endif
4052  const impl_scalar_type df;
4053  const bool compute_diff;
4054 
4055  public:
4056  SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
4057  const BlockTridiags<MatrixType> &btdm,
4058  const vector_type_3d_view &pmv,
4059  const impl_scalar_type damping_factor,
4060  const bool is_norm_manager_active)
4061  : // interface
4062  n_subparts_per_part(interf.n_subparts_per_part)
4063  , partptr(interf.partptr)
4064  , packptr(interf.packptr)
4065  , packindices_sub(interf.packindices_sub)
4066  , packindices_schur(interf.packindices_schur)
4067  , part2packrowidx0(interf.part2packrowidx0)
4068  , part2packrowidx0_sub(interf.part2packrowidx0_sub)
4069  , lclrow(interf.lclrow)
4070  , packptr_sub(interf.packptr_sub)
4071  , partptr_sub(interf.partptr_sub)
4072  , pack_td_ptr_schur(btdm.pack_td_ptr_schur)
4073  // block tridiags and multivector
4074  , pack_td_ptr(btdm.pack_td_ptr)
4076  , D_internal_vector_values((internal_vector_type *)btdm.values.data(),
4077  btdm.values.extent(0),
4078  btdm.values.extent(1),
4079  btdm.values.extent(2),
4080  vector_length / internal_vector_length)
4081  , X_internal_vector_values((internal_vector_type *)pmv.data(),
4082  pmv.extent(0),
4083  pmv.extent(1),
4084  pmv.extent(2),
4085  vector_length / internal_vector_length)
4086  , X_internal_scalar_values((btdm_scalar_type *)pmv.data(),
4087  pmv.extent(0),
4088  pmv.extent(1),
4089  pmv.extent(2),
4090  vector_length)
4091  , X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4092  2 * (n_subparts_per_part - 1) * part2packrowidx0_sub.extent(0),
4093  pmv.extent(1),
4094  pmv.extent(2),
4095  vector_length / internal_vector_length)
4096  , D_internal_vector_values_schur((internal_vector_type *)btdm.values_schur.data(),
4097  btdm.values_schur.extent(0),
4098  btdm.values_schur.extent(1),
4099  btdm.values_schur.extent(2),
4100  vector_length / internal_vector_length)
4101  , e_internal_vector_values((internal_vector_type *)btdm.e_values.data(),
4102  btdm.e_values.extent(0),
4103  btdm.e_values.extent(1),
4104  btdm.e_values.extent(2),
4105  btdm.e_values.extent(3),
4106  vector_length / internal_vector_length)
4107  , vector_loop_size(vector_length / internal_vector_length)
4108  , Y_scalar_multivector()
4109  , Z_scalar_vector()
4110  , df(damping_factor)
4111  , compute_diff(is_norm_manager_active) {}
4112 
4113  public:
4115  KOKKOS_INLINE_FUNCTION
4116  void
4117  copyToFlatMultiVector(const member_type &member,
4118  const local_ordinal_type partidxbeg, // partidx for v = 0
4119  const local_ordinal_type npacks,
4120  const local_ordinal_type pri0,
4121  const local_ordinal_type v, // index with a loop of vector_loop_size
4122  const local_ordinal_type blocksize,
4123  const local_ordinal_type num_vectors) const {
4124  const local_ordinal_type vbeg = v * internal_vector_length;
4125  if (vbeg < npacks) {
4126  local_ordinal_type ri0_vals[internal_vector_length] = {};
4127  local_ordinal_type nrows_vals[internal_vector_length] = {};
4128  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4129  const local_ordinal_type partidx = partidxbeg + vv;
4130  ri0_vals[vi] = partptr(partidx);
4131  nrows_vals[vi] = partptr(partidx + 1) - ri0_vals[vi];
4132  }
4133 
4134  impl_scalar_type z_partial_sum(0);
4135  if (nrows_vals[0] == 1) {
4136  const local_ordinal_type j = 0, pri = pri0;
4137  {
4138  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4139  const local_ordinal_type ri0 = ri0_vals[vi];
4140  const local_ordinal_type nrows = nrows_vals[vi];
4141  if (j < nrows) {
4142  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize),
4143  [&](const local_ordinal_type &i) {
4144  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4145  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4146  impl_scalar_type &y = Y_scalar_multivector(row, col);
4147  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4148  y += df * yd;
4149 
4150  { // if (compute_diff) {
4151  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4152  z_partial_sum += yd_abs * yd_abs;
4153  }
4154  }
4155  });
4156  }
4157  }
4158  }
4159  } else {
4160  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows_vals[0]),
4161  [&](const local_ordinal_type &j) {
4162  const local_ordinal_type pri = pri0 + j;
4163  for (local_ordinal_type vv = vbeg, vi = 0; vv < npacks && vi < internal_vector_length; ++vv, ++vi) {
4164  const local_ordinal_type ri0 = ri0_vals[vi];
4165  const local_ordinal_type nrows = nrows_vals[vi];
4166  if (j < nrows) {
4167  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
4168  for (local_ordinal_type i = 0; i < blocksize; ++i) {
4169  const local_ordinal_type row = blocksize * lclrow(ri0 + j) + i;
4170  impl_scalar_type &y = Y_scalar_multivector(row, col);
4171  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4172  y += df * yd;
4173 
4174  { // if (compute_diff) {
4175  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4176  z_partial_sum += yd_abs * yd_abs;
4177  }
4178  }
4179  }
4180  }
4181  }
4182  });
4183  }
4184  // if (compute_diff)
4185  Z_scalar_vector(member.league_rank()) += z_partial_sum;
4186  }
4187  }
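 // The update applied above is a per-entry damped correction plus a norm
 // contribution:
 //   y_new = y + df * (x_packed - y),  z += |x_packed - y|^2,
 // where x_packed is the solved value read out of X_internal_vector_values
 // and z accumulates into Z_scalar_vector(league_rank) for the norm check.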
4188 
4192  template <typename WWViewType>
4193  KOKKOS_INLINE_FUNCTION void
4194  solveSingleVector(const member_type &member,
4195  const local_ordinal_type &blocksize,
4196  const local_ordinal_type &i0,
4197  const local_ordinal_type &r0,
4198  const local_ordinal_type &nrows,
4199  const local_ordinal_type &v,
4200  const WWViewType &WW) const {
4201  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4202 
4203  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4204  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4205 
4206  // base pointers
4207  auto A = D_internal_vector_values.data();
4208  auto X = X_internal_vector_values.data();
4209 
4210  // constant
4211  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4212  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4213  // const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4214 
4215  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4216  const local_ordinal_type astep = D_internal_vector_values.stride_0();
4217  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); // blocksize*vector_length;
4218  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); // vector_length;
4219  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4220  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); // vector_length;
4221 
4222  // move to starting point
4223  A += i0 * astep + v;
4224  X += r0 * xstep + v;
4225 
4226  // for (local_ordinal_type col=0;col<num_vectors;++col)
4227  if (nrows > 1) {
4228  // solve Lx = x
4229  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4230  member,
4231  KB::Diag::Unit,
4232  blocksize, blocksize,
4233  one,
4234  A, as0, as1,
4235  X, xs0);
4236 
4237  for (local_ordinal_type tr = 1; tr < nrows; ++tr) {
4238  member.team_barrier();
4239  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4240  member,
4241  blocksize, blocksize,
4242  -one,
4243  A + 2 * astep, as0, as1,
4244  X, xs0,
4245  one,
4246  X + 1 * xstep, xs0);
4247  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4248  member,
4249  KB::Diag::Unit,
4250  blocksize, blocksize,
4251  one,
4252  A + 3 * astep, as0, as1,
4253  X + 1 * xstep, xs0);
4254 
4255  A += 3 * astep;
4256  X += 1 * xstep;
4257  }
4258 
4259  // solve Ux = x
4260  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4261  member,
4262  KB::Diag::NonUnit,
4263  blocksize, blocksize,
4264  one,
4265  A, as0, as1,
4266  X, xs0);
4267 
4268  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4269  A -= 3 * astep;
4270  member.team_barrier();
4271  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4272  member,
4273  blocksize, blocksize,
4274  -one,
4275  A + 1 * astep, as0, as1,
4276  X, xs0,
4277  one,
4278  X - 1 * xstep, xs0);
4279  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4280  member,
4281  KB::Diag::NonUnit,
4282  blocksize, blocksize,
4283  one,
4284  A, as0, as1,
4285  X - 1 * xstep, xs0);
4286  X -= 1 * xstep;
4287  }
4288  // for multiple rhs
4289  // X += xs1;
4290  } else {
4291  const local_ordinal_type ws0 = WW.stride_0();
4292  auto W = WW.data() + v;
4293  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type,
4294  member, blocksize, X, xs0, W, ws0);
4295  member.team_barrier();
4296  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4297  member,
4298  blocksize, blocksize,
4299  one,
4300  A, as0, as1,
4301  W, ws0,
4302  zero,
4303  X, xs0);
4304  }
4305  }
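 // Layout note, inferred from the strides above: each block row of the packed
 // tridiagonal stores three consecutive blocks [D_r (LU-factored in place),
 // U_r (superdiagonal), L_r (subdiagonal)], so advancing A by 3 * astep moves
 // one block row. The first loop is forward substitution with the unit-lower
 // factors (x_{r+1} -= L_r x_r, then an L-solve); the second loop is backward
 // substitution with the upper factors (x_{r-1} -= U_{r-1} x_r, then a
 // U-solve). The nrows == 1 branch multiplies by the pre-inverted block.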
4306 
4307  template <typename WWViewType>
4308  KOKKOS_INLINE_FUNCTION void
4309  solveMultiVector(const member_type &member,
4310  const local_ordinal_type & /* blocksize */,
4311  const local_ordinal_type &i0,
4312  const local_ordinal_type &r0,
4313  const local_ordinal_type &nrows,
4314  const local_ordinal_type &v,
4315  const WWViewType &WW) const {
4316  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4317 
4318  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4319  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4320 
4321  // constant
4322  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4323  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4324 
4325  // subview pattern
4326  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4327  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4328  auto X2 = X1;
4329 
4330  local_ordinal_type i = i0, r = r0;
4331 
4332  if (nrows > 1) {
4333  // solve Lx = x
4334  KB::Trsm<member_type,
4335  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4336  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4337  for (local_ordinal_type tr = 1; tr < nrows; ++tr, i += 3) {
4338  A.assign_data(&D_internal_vector_values(i + 2, 0, 0, v));
4339  X2.assign_data(&X_internal_vector_values(++r, 0, 0, v));
4340  member.team_barrier();
4341  KB::Gemm<member_type,
4342  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4343  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4344  A.assign_data(&D_internal_vector_values(i + 3, 0, 0, v));
4345  KB::Trsm<member_type,
4346  KB::Side::Left, KB::Uplo::Lower, KB::Trans::NoTranspose, KB::Diag::Unit,
4347  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4348  X1.assign_data(X2.data());
4349  }
4350 
4351  // solve Ux = x
4352  KB::Trsm<member_type,
4353  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4354  default_mode_type, default_algo_type>::invoke(member, one, A, X1);
4355  for (local_ordinal_type tr = nrows; tr > 1; --tr) {
4356  i -= 3;
4357  A.assign_data(&D_internal_vector_values(i + 1, 0, 0, v));
4358  X2.assign_data(&X_internal_vector_values(--r, 0, 0, v));
4359  member.team_barrier();
4360  KB::Gemm<member_type,
4361  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4362  default_mode_type, default_algo_type>::invoke(member, -one, A, X1, one, X2);
4363 
4364  A.assign_data(&D_internal_vector_values(i, 0, 0, v));
4365  KB::Trsm<member_type,
4366  KB::Side::Left, KB::Uplo::Upper, KB::Trans::NoTranspose, KB::Diag::NonUnit,
4367  default_mode_type, default_algo_type>::invoke(member, one, A, X2);
4368  X1.assign_data(X2.data());
4369  }
4370  } else {
4371  // matrix is already inverted
4372  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4373  KB::Copy<member_type, KB::Trans::NoTranspose, default_mode_type>::invoke(member, X1, W);
4374  member.team_barrier();
4375  KB::Gemm<member_type,
4376  KB::Trans::NoTranspose, KB::Trans::NoTranspose,
4377  default_mode_type, default_algo_type>::invoke(member, one, A, W, zero, X1);
4378  }
4379  }
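 // solveMultiVector is the Level-3 analogue of solveSingleVector: the same
 // forward/backward sweep over the block rows, but using Trsm/Gemm to process
 // all right-hand-side columns at once, with the same Copy + Gemm fallback
 // when nrows == 1 and the diagonal block is already inverted.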
4380 
4381  template <int B>
4382  struct SingleVectorTag {};
4383  template <int B>
4384  struct MultiVectorTag {};
4385 
4386  template <int B>
4387  struct SingleVectorSubLineTag {};
4388  template <int B>
4389  struct MultiVectorSubLineTag {};
4390  template <int B>
4391  struct SingleVectorApplyCTag {};
4392  template <int B>
4393  struct MultiVectorApplyCTag {};
4394  template <int B>
4395  struct SingleVectorSchurTag {};
4396  template <int B>
4397  struct MultiVectorSchurTag {};
4398  template <int B>
4399  struct SingleVectorApplyETag {};
4400  template <int B>
4401  struct MultiVectorApplyETag {};
4402  template <int B>
4403  struct SingleVectorCopyToFlatTag {};
4404  template <int B>
4405  struct SingleZeroingTag {};
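 // Dispatch overview: SingleVectorTag / MultiVectorTag solve whole lines in a
 // single kernel. When lines are split into sub-lines (packindices_schur is
 // non-empty), run() instead launches the Schur-complement pipeline
 //   SingleZeroingTag -> SingleVectorSubLineTag -> SingleVectorApplyCTag
 //   -> SingleVectorSchurTag -> SingleVectorApplyETag
 //   -> SingleVectorCopyToFlatTag
 // (see the macro in run() below).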
4406 
4407  template <int B>
4408  KOKKOS_INLINE_FUNCTION void
4409  operator()(const SingleVectorTag<B> &, const member_type &member) const {
4410  const local_ordinal_type packidx = member.league_rank();
4411  const local_ordinal_type partidx = packptr(packidx);
4412  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4413  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4414  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4415  const local_ordinal_type r0 = part2packrowidx0(partidx);
4416  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4417  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4418  const local_ordinal_type num_vectors = 1;
4419  internal_vector_scratch_type_3d_view
4420  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4421  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4422  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4423  });
4424  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4425  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4426  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4427  });
4428  }
4429 
4430  template <int B>
4431  KOKKOS_INLINE_FUNCTION void
4432  operator()(const MultiVectorTag<B> &, const member_type &member) const {
4433  const local_ordinal_type packidx = member.league_rank();
4434  const local_ordinal_type partidx = packptr(packidx);
4435  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4436  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4437  const local_ordinal_type i0 = pack_td_ptr(partidx, 0);
4438  const local_ordinal_type r0 = part2packrowidx0(partidx);
4439  const local_ordinal_type nrows = partptr(partidx + 1) - partptr(partidx);
4440  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4441  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4442 
4443  internal_vector_scratch_type_3d_view
4444  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4445  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4446  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4447  });
4448  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4449  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4450  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4451  });
4452  }
4453 
4454  template <int B>
4455  KOKKOS_INLINE_FUNCTION void
4456  operator()(const SingleVectorSubLineTag<B> &, const member_type &member) const {
4457  // btdm is packed and sorted starting from the largest part
4458  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4459 
4460  const local_ordinal_type subpartidx = packptr_sub(packidx);
4461  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4462  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4463  const local_ordinal_type partidx = subpartidx % n_parts;
4464 
4465  const local_ordinal_type npacks = packptr_sub(packidx + 1) - subpartidx;
4466  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4467  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4468  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4469  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4470 
4471  //(void) i0;
4472  //(void) nrows;
4473  (void)npacks;
4474 
4475  internal_vector_scratch_type_3d_view
4476  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4477 
4478  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4479  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4480  });
4481  }
4482 
4483  template <int B>
4484  KOKKOS_INLINE_FUNCTION void
4485  operator()(const SingleVectorApplyCTag<B> &, const member_type &member) const {
4486  // btdm is packed and sorted starting from the largest part
4487  // const local_ordinal_type packidx = packindices_schur(member.league_rank());
4488  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4489 
4490  const local_ordinal_type subpartidx = packptr_sub(packidx);
4491  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4492  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4493  const local_ordinal_type partidx = subpartidx % n_parts;
4494  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4495 
4496  // const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4497  const local_ordinal_type i0 = pack_td_ptr(partidx, local_subpartidx);
4498  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4499  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4500 
4501  internal_vector_scratch_type_3d_view
4502  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4503 
4504  // Compute v_2 = v_2 - C v_1
4505 
4506  const local_ordinal_type local_subpartidx_schur = (local_subpartidx - 1) / 2;
4507  const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx, local_subpartidx_schur) : pack_td_ptr_schur(partidx, local_subpartidx_schur) + 1;
4508  const local_ordinal_type i0_offset = i0 + 2;
4509 
4510  (void)i0_schur;
4511  (void)i0_offset;
4512 
4513  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4514 
4515  const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx) - 2 : 0;
4516  const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx + 1) + 1;
4517 
4518  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4519 
4520  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4521  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4522 
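 // Three coupling cases: the first sub-line of a part updates only the
 // interface row below it (via C at c_kps1), the last sub-line only the
 // interface row above it (via C at c_kps2), and interior sub-lines both.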
4523  if (local_subpartidx == 0) {
4524  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4525  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4526  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4527  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4528 
4529  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4530  member,
4531  blocksize, blocksize,
4532  -one,
4533  C.data(), C.stride_0(), C.stride_1(),
4534  v_1.data(), v_1.stride_0(),
4535  one,
4536  v_2.data(), v_2.stride_0());
4537  });
4538  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4539  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4540  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4541  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4542  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4543 
4544  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4545  member,
4546  blocksize, blocksize,
4547  -one,
4548  C.data(), C.stride_0(), C.stride_1(),
4549  v_1.data(), v_1.stride_0(),
4550  one,
4551  v_2.data(), v_2.stride_0());
4552  });
4553  } else {
4554  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4555  {
4556  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + nrows - 1, Kokkos::ALL(), 0, v);
4557  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4558  auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4559 
4560  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4561  member,
4562  blocksize, blocksize,
4563  -one,
4564  C.data(), C.stride_0(), C.stride_1(),
4565  v_1.data(), v_1.stride_0(),
4566  one,
4567  v_2.data(), v_2.stride_0());
4568  }
4569  {
4570  auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4571  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4572  auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4573 
4574  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4575  member,
4576  blocksize, blocksize,
4577  -one,
4578  C.data(), C.stride_0(), C.stride_1(),
4579  v_1.data(), v_1.stride_0(),
4580  one,
4581  v_2.data(), v_2.stride_0());
4582  }
4583  });
4584  }
4585  }
4586 
4587  template <int B>
4588  KOKKOS_INLINE_FUNCTION void
4589  operator()(const SingleVectorSchurTag<B> &, const member_type &member) const {
4590  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4591 
4592  const local_ordinal_type partidx = packptr_sub(packidx);
4593 
4594  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4595 
4596  const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx, 0);
4597  const local_ordinal_type nrows = 2 * (n_subparts_per_part - 1);
4598 
4599  const local_ordinal_type r0_schur = nrows * member.league_rank();
4600 
4601  internal_vector_scratch_type_3d_view
4602  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4603 
4604  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4605  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4606  for (local_ordinal_type i = 0; i < 2; ++i) {
4607  copy3DView<local_ordinal_type>(member,
4608  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4609  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4610  }
4611  }
4612 
4613  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4614  solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view>(member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4615  });
4616 
4617  for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part - 1; ++schur_sub_part) {
4618  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, 2 * schur_sub_part + 1);
4619  for (local_ordinal_type i = 0; i < 2; ++i) {
4620  copy3DView<local_ordinal_type>(member,
4621  Kokkos::subview(X_internal_vector_values, r0 + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4622  Kokkos::subview(X_internal_vector_values_schur, r0_schur + 2 * schur_sub_part + i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4623  }
4624  }
4625  }
4626 
4627  template <int B>
4628  KOKKOS_INLINE_FUNCTION void
4629  operator()(const SingleVectorApplyETag<B> &, const member_type &member) const {
4630  const local_ordinal_type packidx = packindices_sub(member.league_rank());
4631 
4632  const local_ordinal_type subpartidx = packptr_sub(packidx);
4633  const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4634  const local_ordinal_type local_subpartidx = subpartidx / n_parts;
4635  const local_ordinal_type partidx = subpartidx % n_parts;
4636  const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4637 
4638  const local_ordinal_type r0 = part2packrowidx0_sub(partidx, local_subpartidx);
4639  const local_ordinal_type nrows = partptr_sub(subpartidx, 1) - partptr_sub(subpartidx, 0);
4640 
4641  internal_vector_scratch_type_3d_view
4642  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4643 
4644  // Compute v_2 = v_2 - C v_1
4645 
4646  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4647 
4648  typedef SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space> default_mode_and_algo_type;
4649 
4650  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4651  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4652 
4653  if (local_subpartidx == 0) {
4654  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4655  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4656 
4657  for (local_ordinal_type row = 0; row < nrows; ++row) {
4658  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4659  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4660 
4661  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4662  member,
4663  blocksize, blocksize,
4664  -one,
4665  E.data(), E.stride_0(), E.stride_1(),
4666  v_2.data(), v_2.stride_0(),
4667  one,
4668  v_1.data(), v_1.stride_0());
4669  }
4670  });
4671  } else if (local_subpartidx == (local_ordinal_type)part2packrowidx0_sub.extent(1) - 2) {
4672  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4673  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4674 
4675  for (local_ordinal_type row = 0; row < nrows; ++row) {
4676  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4677  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4678 
4679  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4680  member,
4681  blocksize, blocksize,
4682  -one,
4683  E.data(), E.stride_0(), E.stride_1(),
4684  v_2.data(), v_2.stride_0(),
4685  one,
4686  v_1.data(), v_1.stride_0());
4687  }
4688  });
4689  } else {
4690  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4691  {
4692  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 + nrows, Kokkos::ALL(), 0, v);
4693 
4694  for (local_ordinal_type row = 0; row < nrows; ++row) {
4695  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4696  auto E = Kokkos::subview(e_internal_vector_values, 0, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4697 
4698  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4699  member,
4700  blocksize, blocksize,
4701  -one,
4702  E.data(), E.stride_0(), E.stride_1(),
4703  v_2.data(), v_2.stride_0(),
4704  one,
4705  v_1.data(), v_1.stride_0());
4706  }
4707  }
4708  {
4709  auto v_2 = Kokkos::subview(X_internal_vector_values, r0 - 1, Kokkos::ALL(), 0, v);
4710 
4711  for (local_ordinal_type row = 0; row < nrows; ++row) {
4712  auto v_1 = Kokkos::subview(X_internal_vector_values, r0 + row, Kokkos::ALL(), 0, v);
4713  auto E = Kokkos::subview(e_internal_vector_values, 1, r0 + row, Kokkos::ALL(), Kokkos::ALL(), v);
4714 
4715  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE(default_mode_type, default_algo_type,
4716  member,
4717  blocksize, blocksize,
4718  -one,
4719  E.data(), E.stride_0(), E.stride_1(),
4720  v_2.data(), v_2.stride_0(),
4721  one,
4722  v_1.data(), v_1.stride_0());
4723  }
4724  }
4725  });
4726  }
4727  }
4728 
4729  template <int B>
4730  KOKKOS_INLINE_FUNCTION void
4731  operator()(const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4732  const local_ordinal_type packidx = member.league_rank();
4733  const local_ordinal_type partidx = packptr(packidx);
4734  const local_ordinal_type npacks = packptr(packidx + 1) - partidx;
4735  const local_ordinal_type pri0 = part2packrowidx0(partidx);
4736  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4737  const local_ordinal_type num_vectors = 1;
4738 
4739  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const int &v) {
4740  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4741  });
4742  }
4743 
4744  template <int B>
4745  KOKKOS_INLINE_FUNCTION void
4746  operator()(const SingleZeroingTag<B> &, const member_type &member) const {
4747  Kokkos::single(Kokkos::PerTeam(member), [&]() {
4748  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4749  });
4750  }
4751 
4752  void run(const impl_scalar_type_2d_view_tpetra &Y,
4753  const impl_scalar_type_1d_view &Z) {
4754  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4755  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4756 
4758  this->Y_scalar_multivector = Y;
4759  this->Z_scalar_vector = Z;
4760 
4761  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4762  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4763 
4764  const local_ordinal_type team_size =
4765  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4766  recommended_team_size(blocksize, vector_length, internal_vector_length);
4767  const int per_team_scratch = internal_vector_scratch_type_3d_view::shmem_size(blocksize, num_vectors, vector_loop_size);
4768 
4769 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4770 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4771  if (num_vectors == 1) { \
4772  const Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4773  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4774  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4775  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4776  } else { \
4777  const Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4778  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4779  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4780  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)), *this); \
4781  } \
4782  break
4783 #else
4784 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4785  if (num_vectors == 1) { \
4786  if (packindices_schur.extent(1) <= 0) { \
4787  Kokkos::TeamPolicy<execution_space, SingleVectorTag<B>> \
4788  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4789  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4790  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4791  policy, *this); \
4792  } else { \
4793  { \
4794  Kokkos::TeamPolicy<execution_space, SingleZeroingTag<B>> \
4795  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4796  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4797  policy, *this); \
4798  } \
4799  { \
4800  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4801  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4802  Kokkos::TeamPolicy<execution_space, SingleVectorSubLineTag<B>> \
4803  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4804  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4805  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4806  policy, *this); \
4807  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4808  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4809  } \
4810  { \
4811  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4812  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4813  Kokkos::TeamPolicy<execution_space, SingleVectorApplyCTag<B>> \
4814  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4815  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4816  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4817  policy, *this); \
4818  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4819  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4820  } \
4821  { \
4822  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4823  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4824  Kokkos::TeamPolicy<execution_space, SingleVectorSchurTag<B>> \
4825  policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4826  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4827  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4828  policy, *this); \
4829  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4830  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4831  } \
4832  { \
4833  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4834  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4835  Kokkos::TeamPolicy<execution_space, SingleVectorApplyETag<B>> \
4836  policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4837  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4838  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVector>", \
4839  policy, *this); \
4840  write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4841  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4842  } \
4843  { \
4844  Kokkos::TeamPolicy<execution_space, SingleVectorCopyToFlatTag<B>> \
4845  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4846  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4847  policy, *this); \
4848  } \
4849  } \
4850  } else { \
4851  Kokkos::TeamPolicy<execution_space, MultiVectorTag<B>> \
4852  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4853  policy.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); \
4854  Kokkos::parallel_for("SolveTridiags::TeamPolicy::run<MultiVector>", \
4855  policy, *this); \
4856  } \
4857  break
4858 #endif
4859  switch (blocksize) {
4860  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(3);
4861  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(5);
4862  case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(6);
4863  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(7);
4864  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4865  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4866  case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4867  case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4868  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4869  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4870  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4871  case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4872  default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(0);
4873  }
4874 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
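 // The switch above instantiates the solve kernels with a compile-time block
 // size for the common sizes so the inner loops see a constant trip count;
 // B = 0 is the generic fallback that reads the block size from
 // D_internal_vector_values.extent(1) at run time.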
4875 
4876  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4877  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4878  }
4879 };
4880 
4884 template <typename MatrixType>
4885 int applyInverseJacobi( // importer
4886  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4887  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4888  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4889  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
4890  const bool overlap_communication_and_computation,
4891  // tpetra interface
4892  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4893  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4894  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4895  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4896  // local object interface
4897  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4898  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4899  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4900  /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4901  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
4902  // preconditioner parameters
4903  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
4904  /* */ bool is_y_zero,
4905  const int max_num_sweeps,
4906  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4907  const int check_tol_every) {
4908  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4909 
4910  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
4911  using node_memory_space = typename impl_type::node_memory_space;
4912  using local_ordinal_type = typename impl_type::local_ordinal_type;
4913  using size_type = typename impl_type::size_type;
4914  using impl_scalar_type = typename impl_type::impl_scalar_type;
4915  using magnitude_type = typename impl_type::magnitude_type;
4916  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4917  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4918  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4919  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4920 
4921  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4922 
4923  // only one of the importers (Tpetra or async) may be active
4924  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4925  "Both the Tpetra importer and the async importer are non-null; only one may be active.");
4926  // the maximum number of sweeps must be a positive number
4927  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4928  "Maximum number of sweeps must be >= 1.");
4929 
4930  // const parameters
4931  const bool is_seq_method_requested = !tpetra_importer.is_null();
4932  const bool is_async_importer_active = !async_importer.is_null();
4933  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4934  const magnitude_type tolerance = tol * tol;
4935  const local_ordinal_type blocksize = btdm.values.extent(1);
4936  const local_ordinal_type num_vectors = Y.getNumVectors();
4937  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4938 
4939  const impl_scalar_type zero(0.0);
4940 
4941  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4942  "The seq method for applyInverseJacobi, "
4943  << "which in any case is for developer use only, "
4944  << "does not support norm-based termination.");
4945  const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4946  Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4947  TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4948  std::invalid_argument,
4949  "The seq method for applyInverseJacobi, "
4950  << "which in any case is for developer use only, "
4951  << "only supports memory spaces accessible from host.");
4952 
4953  // resize the workspace if it is too small
4954  const size_type work_span_required = num_blockrows * num_vectors * blocksize;
4955  if (work.span() < work_span_required)
4956  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4957 
4958  // construct W
4959  const local_ordinal_type W_size = interf.packptr.extent(0) - 1;
4960  if (local_ordinal_type(W.extent(0)) < W_size)
4961  W = impl_scalar_type_1d_view("W", W_size);
4962 
4963  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4964  {
4965  if (is_seq_method_requested) {
4966  if (Z.getNumVectors() != Y.getNumVectors())
4967  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4968  } else {
4969  if (is_async_importer_active) {
4970  // create comm data buffer and keep it here
4971  async_importer->createDataBuffer(num_vectors);
4972  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4973  }
4974  }
4975  }
4976 
4977  // wrap the workspace with 3d view
4978  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4979  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4980  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4981  const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4982  if (is_y_zero) Kokkos::deep_copy(YY, zero);
4983 
4984  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4985  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4986  damping_factor, is_norm_manager_active);
4987 
4988  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4989 
4990  auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
4991  auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
4992 
4993  bool hasBlockCrsMatrix = !A_bcrs.is_null();
4994 
4995  // It is OK here to use the graph of the A_crs matrix with a block size of 1
4996  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
4997 
4998  BlockHelperDetails::ComputeResidualVector<MatrixType>
4999  compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
5000  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
5001  hasBlockCrsMatrix);
5002 
5003  // set how often the norm manager checks the convergence tolerance
5004  if (is_norm_manager_active)
5005  norm_manager.setCheckFrequency(check_tol_every);
5006 
5007  // iterate
5008  int sweep = 0;
5009  for (; sweep < max_num_sweeps; ++sweep) {
5010  {
5011  if (is_y_zero) {
5012  // pmv := x(lclrow)
5013  multivector_converter.run(XX);
5014  } else {
5015  if (is_seq_method_requested) {
5016  // SEQ METHOD IS TESTING ONLY
5017 
5018  // y := x - R y
5019  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5020  compute_residual_vector.run(YY, XX, ZZ);
5021 
5022  // pmv := y(lclrow).
5023  multivector_converter.run(YY);
5024  } else {
5025  // fused y := x - R y and pmv := y(lclrow);
5026  // real use case does not use overlap comp and comm
5027  if (overlap_communication_and_computation || !is_async_importer_active) {
5028  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5029  // OverlapTag, compute_owned = true
5030  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5031  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5032  if (is_async_importer_active) async_importer->cancel();
5033  break;
5034  }
5035  if (is_async_importer_active) {
5036  async_importer->syncRecv();
5037  // OverlapTag, compute_owned = false
5038  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5039  }
5040  } else {
5041  if (is_async_importer_active)
5042  async_importer->syncExchange(YY);
5043  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5044  // AsyncTag
5045  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5046  }
5047  }
5048  }
5049  }
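// The branch just above follows the usual overlap pattern: start the halo
// exchange, do the work that needs no remote data, wait, then finish with the
// received columns. Schematic sketch with hypothetical helper names (not this
// header's API):
//
//   importer.asyncSendRecv(y);              // post sends/receives
//   computeOwnedResidual(pmv, x, y);        // overlap: owned columns only
//   importer.syncRecv();                    // wait for remote y entries
//   computeRemoteResidual(pmv, x, y_rem);   // fold in remote columns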
5050 
5051  // pmv := inv(D) pmv, with the damped update written back into YY
5052  {
5053  solve_tridiags.run(YY, W);
5054  }
5055  {
5056  if (is_norm_manager_active) {
5057  // reduce W, the per-pack norm contributions of the damped update y(lclrow) := (1 - a) y(lclrow) + a pmv, into the norm manager's buffer
5058  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5059  if (sweep + 1 == max_num_sweeps) {
5060  norm_manager.ireduce(sweep, true);
5061  norm_manager.checkDone(sweep + 1, tolerance, true);
5062  } else {
5063  norm_manager.ireduce(sweep);
5064  }
5065  }
5066  }
5067  is_y_zero = false;
5068  }
5069 
5070  // sqrt the norms for the caller's use.
5071  if (is_norm_manager_active) norm_manager.finalize();
5072 
5073  return sweep;
5074 }
5075 
5076 // Implementation of fused block Jacobi for a specific block size,
5077 // or (if B == 0) for a general block size.
5078 template <typename MatrixType, int B>
5079 int applyFusedBlockJacobi_Impl(
5080  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5081  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5082  const bool overlap_communication_and_computation,
5083  // tpetra interface
5084  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5085  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5086  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5087  // local object interface
5088  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5089  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5090  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5091  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5092  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5093  // preconditioner parameters
5094  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5095  /* */ bool is_y_zero,
5096  const int max_num_sweeps,
5097  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5098  const int check_tol_every) {
5099  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5100  using local_ordinal_type = typename impl_type::local_ordinal_type;
5101  using size_type = typename impl_type::size_type;
5102  using magnitude_type = typename impl_type::magnitude_type;
5103  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5104  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5105 
5106  // the tpetra importer and async importer can't both be active
5107  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
5108  "Neither Tpetra importer nor Async importer is null.");
5109  // the maximum number of sweeps must be positive
5110  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
5111  "Maximum number of sweeps must be >= 1.");
5112 
5113  // const parameters
5114  const bool is_async_importer_active = !async_importer.is_null();
5115  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
5116  const magnitude_type tolerance = tol * tol;
5117  const local_ordinal_type blocksize = btdm.d_inv.extent(1);
5118  const local_ordinal_type num_vectors = Y.getNumVectors();
5119  const local_ordinal_type num_blockrows = interf.nparts;
5120 
5121  typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
5122  {
5123  if (is_async_importer_active) {
5124  // create comm data buffer and keep it here
5125  async_importer->createDataBuffer(num_vectors);
5126  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
5127  }
5128  }
5129 
5130  const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
5131  const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
5132 
5133  const bool two_pass_residual =
5134  overlap_communication_and_computation && is_async_importer_active;
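// When two_pass_residual is true, the residual is split by column ownership so
// communication can overlap pass 1. Schematically (a sketch of the algebra,
// not this header's API), with a = damping_factor:
//
//   r      = x - R_owned * y          // pass 1: no remote data needed
//   r     -= R_remote * y_remote      // pass 2: after syncRecv()
//   y_next = (1 - a) * y + a * Dinv * r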
5135 
5136  // Calculate the required work size and reallocate it if not already big enough.
5137  // Check that our assumptions about YY dimension are correct.
5138  TEUCHOS_TEST_FOR_EXCEPT_MSG(
5139  size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
5140  "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << " = " << YY.extent(0) * YY.extent(1) << ",\n"
5141  << "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');
5142  size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
5143  if (work.extent(0) < work_required) {
5144  work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
5145  }
5146 
5147  Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);
5148 
5149  // construct W
5150  if (W.extent(0) != size_t(num_blockrows))
5151  W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);
5152 
5153  // Create the required functors upfront (this is inexpensive - all shallow copies)
5154  BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
5155  functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
5156  BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
5157  functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5158  BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
5159  functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);
5160 
5161  // set how often the norm manager checks the convergence tolerance
5162  if (is_norm_manager_active)
5163  norm_manager.setCheckFrequency(check_tol_every);
5164 
5165  // For double-buffering:
5166  // y_buffers[current_y] has the current iterate of y.
5167  // y_buffers[1 - current_y] receives the next iterate of y.
5168  Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
5169  int current_y = 0;
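// Double-buffering sketch (illustrative): every sweep reads one buffer and
// writes the other, so a kernel never reads and writes the same vector;
// flipping current_y swaps the roles.
//
//   for (int s = 0; s < max_num_sweeps; ++s) {
//     sweep_kernel(y_buffers[current_y], y_buffers[1 - current_y]);  // hypothetical kernel
//     current_y = 1 - current_y;
//   }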
5170 
5171  // iterate
5172  int sweep = 0;
5173  for (; sweep < max_num_sweeps; ++sweep) {
5174  if (is_y_zero) {
5175  // If y is initially zero, then we are just computing y := damping_factor * Dinv * x
5176  functor_solve_only.run(XX, y_buffers[1 - current_y]);
5177  } else {
5178  // real use case does not use overlap comp and comm
5179  if (overlap_communication_and_computation || !is_async_importer_active) {
5180  if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
5181  if (two_pass_residual) {
5182  // Pass 1 computes owned residual and stores into new y buffer,
5183  // but doesn't apply Dinv or produce a norm yet
5184  functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1 - current_y]);
5185  } else {
5186  // This case happens when running on a single rank.
5187  // There are no remote columns, so residual and solve can happen in one step.
5188  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5189  }
5190  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5191  if (is_async_importer_active) async_importer->cancel();
5192  break;
5193  }
5194  if (is_async_importer_active) {
5195  async_importer->syncRecv();
5196  // Stage 2 finishes computing the residual, then applies Dinv and computes the norm.
5197  functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5198  }
5199  } else {
5200  if (is_async_importer_active)
5201  async_importer->syncExchange(y_buffers[current_y]);
5202  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5203  // Full residual, Dinv apply, and norm in one kernel
5204  functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1 - current_y]);
5205  }
5206  }
5207 
5208  // Compute global norm.
5209  if (is_norm_manager_active) {
5210  BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5211  if (sweep + 1 == max_num_sweeps) {
5212  norm_manager.ireduce(sweep, true);
5213  norm_manager.checkDone(sweep + 1, tolerance, true);
5214  } else {
5215  norm_manager.ireduce(sweep);
5216  }
5217  }
5218  is_y_zero = false;
5219  // flip the y buffers for the next iteration (or for the final copy-back if this was the last sweep)
5220  current_y = 1 - current_y;
5221  }
5222  if (current_y == 1) {
5223  // We finished iterating with y in the double buffer, so copy it to the user's vector.
5224  Kokkos::deep_copy(YY, y_doublebuf);
5225  }
5226 
5227  // sqrt the norms for the caller's use.
5228  if (is_norm_manager_active) norm_manager.finalize();
5229  return sweep;
5230 }
5231 
5232 // Top-level fused block Jacobi apply. It dispatches on the runtime block
5233 // size to a compile-time specialization of applyFusedBlockJacobi_Impl
5234 // (B = 0 is the general fallback for unspecialized block sizes).
5235 template <typename MatrixType>
5236 int applyFusedBlockJacobi(
5237  const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
5238  const Teuchos::RCP<AsyncableImport<MatrixType>> &async_importer,
5239  const bool overlap_communication_and_computation,
5240  // tpetra interface
5241  const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
5242  /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
5243  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
5244  // local object interface
5245  const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
5246  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
5247  const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
5248  /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work, // workspace
5249  /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
5250  // preconditioner parameters
5251  const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
5252  /* */ bool is_y_zero,
5253  const int max_num_sweeps,
5254  const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
5255  const int check_tol_every) {
5256  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
5257  int blocksize = btdm.d_inv.extent(1);
5258  int sweep = 0;
5259 #define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) \
5260  { \
5261  sweep = applyFusedBlockJacobi_Impl<MatrixType, B>( \
5262  tpetra_importer, async_importer, overlap_communication_and_computation, \
5263  X, Y, W, interf, btdm, amd, work, \
5264  norm_manager, damping_factor, is_y_zero, \
5265  max_num_sweeps, tol, check_tol_every); \
5266  } \
5267  break
5268  switch (blocksize) {
5269  case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(3);
5270  case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(5);
5271  case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(7);
5272  case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(9);
5273  case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
5274  case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
5275  case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
5276  case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
5277  case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
5278  default: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(0);
5279  }
5280 #undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI
5281 
5282  return sweep;
5283 }
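// For reference, each labeled case above expands to a call of the form
// (shown here for blocksize 3):
//
//   sweep = applyFusedBlockJacobi_Impl<MatrixType, 3>(
//       tpetra_importer, async_importer, overlap_communication_and_computation,
//       X, Y, W, interf, btdm, amd, work,
//       norm_manager, damping_factor, is_y_zero,
//       max_num_sweeps, tol, check_tol_every);
//
// The default case passes B = 0, selecting kernels that treat the block size
// as a runtime value instead of a compile-time constant.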
5284 
5285 template <typename MatrixType>
5286 struct ImplObject {
5287  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
5288  using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5289  using block_tridiags_type = BlockTridiags<MatrixType>;
5290  using amd_type = BlockHelperDetails::AmD<MatrixType>;
5291  using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5292  using async_import_type = AsyncableImport<MatrixType>;
5293 
5294  // distributed objects
5295  Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5296  Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
5297  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5298  Teuchos::RCP<async_import_type> async_importer;
5299  bool overlap_communication_and_computation;
5300 
5301  // copy of Y (mutable to penetrate const)
5302  mutable typename impl_type::tpetra_multivector_type Z;
5303  mutable typename impl_type::impl_scalar_type_1d_view W;
5304 
5305  // local objects
5306  part_interface_type part_interface;
5307  block_tridiags_type block_tridiags; // D
5308  amd_type a_minus_d; // R = A - D
5309 
5310  // whether to use fused block Jacobi path
5311  bool use_fused_jacobi;
5312 
5313  // vector workspace is used for the general block tridiagonal case
5314  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector)
5315  // scalar workspace is used for the fused block Jacobi case
5316  mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar)
5317  mutable norm_manager_type norm_manager;
5318 };
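// Illustrative sketch of how a container is expected to drive these pieces end
// to end. The free functions exist in this file, but the exact call sites and
// arguments below are simplified assumptions:
//
//   ImplObject<MatrixType> impl;
//   impl.part_interface = createPartInterface<MatrixType>(A, G, partitions, n_subparts);
//   impl.block_tridiags = createBlockTridiags<MatrixType>(impl.part_interface);
//   performSymbolicPhase<MatrixType>(...);  // fills a_minus_d and graph data
//   performNumericPhase<MatrixType>(...);   // factors the block tridiagonals
//   int sweeps = applyInverseJacobi<MatrixType>(
//       A, G, impl.tpetra_importer, impl.async_importer,
//       impl.overlap_communication_and_computation,
//       X, Y, impl.Z, impl.W,
//       impl.part_interface, impl.block_tridiags, impl.a_minus_d,
//       impl.work, impl.norm_manager,
//       damping_factor, /*is_y_zero=*/true, max_num_sweeps, tol, check_tol_every);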
5319 
5320 } // namespace BlockTriDiContainerDetails
5321 
5322 } // namespace Ifpack2
5323 
5324 #endif