Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockComputeResidualAndSolve.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {

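// Compute, for each block row, the offset (in scalar entries) of that row's
// diagonal block within the block CRS values array: the entry index times
// blocksize^2. Reports an error if any row has no diagonal entry.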
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err) {
        auto rowBegin = rowptrs(row);
        auto rowEnd = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        err++;
      },
      err);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}
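// Usage sketch (illustrative only; the offsets view type shown here is an
// assumption, and callers would normally pass their own view typedefs):
//   using offsets_view = Kokkos::View<size_t*, typename ExecSpace::memory_space>;
//   auto diag_offsets = findDiagOffsets<ExecSpace, offsets_view>(
//       rowptrs, entries, nrows, blocksize);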
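// Single-kernel functor for one damped block-Jacobi sweep: for each block row,
// compute the residual b - R*x (R is A minus its block diagonal) using both
// owned (x) and remote (x_remote) columns, apply the inverse diagonal block
// d_inv, write the damped update into y, and store the per-row squared update
// norm in W.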
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

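// One team per block row: for each column of the multivector, stage b in team
// scratch, subtract all off-diagonal block contributions (reading from x or
// x_remote depending on the offset cutoff), apply d_inv, then apply the damped
// update to y while accumulating the squared update norm for this row.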
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for a local copy of x, residual, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single
  // kernel)
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    x_remote = x_remote_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    // team: local_residual, local_Dinv_residual
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    // thread: local_x
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
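// Dispatch sketch (illustrative only; amd, d_inv, W, blocksize, damping, and
// the multivector views stand in for the caller's data, which this header does
// not define):
//   ComputeResidualAndSolve_1Pass<MatrixType, 0> functor(amd, d_inv, W,
//                                                        blocksize, damping);
//   functor.run(b, x, x_remote, y);  // B == 0 selects the runtime blocksize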
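// Two-kernel variant of the functor above. Pass 1 (OwnedTag) computes the
// owned part of the residual and stages it in y; pass 2 (NonownedTag) finishes
// the residual with x_remote, applies d_inv, and performs the damped update
// and norm computation. Pass 1 does not need x_remote, so it can run before
// the remote x entries have arrived.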
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

  // Tag for computing residual with owned columns only (pass 1)
  struct OwnedTag {};

  // Tag for finishing the residual with nonowned columns, and solving/norming
  // (pass 2)
  struct NonownedTag {};

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

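// Pass 1: one team per block row. Computes the partial residual
// b - (owned off-diagonal blocks) * x and writes it to y, which pass 2 reads
// back as its starting residual.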
  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for local copies of x and the residual
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Write back the partial residual to y
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch pass 1 of the 2-pass version.
  // This computes just the owned part of residual and writes that back to y.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  // Launch pass 2 of the 2-pass version.
  // This finishes computing residual with x_remote,
  // and then applies Dinv and computes norm.
  void run_pass2(
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
      const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x = x_;
    x_remote = x_remote_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
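// Usage sketch (illustrative only; the functor arguments and the import step
// are placeholders, not defined in this header):
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(amd, d_inv, W,
//                                                        blocksize, damping);
//   functor.run_pass1(b, x, y);
//   // ... import the remote x entries into x_remote ...
//   functor.run_pass2(x, x_remote, y);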
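// First-iteration variant: with a zero initial guess for y the residual is
// simply b, so no residual kernel is needed. Each block row of y is set to
// damping_factor * d_inv * b, and the per-row squared update norms go into W.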
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout
                                                          // right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector
                                                            // (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}

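// Unlike the functors above, each thread (not each team) handles one block
// row: rowidx combines league_rank and team_rank, the scratch buffer is
// per-thread, and out-of-range rows simply return.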
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Get thread-local scratch to hold D^-1 * b for this block row
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // Compute local_Dinv_residual = D^-1 * b (the residual is just b here)
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++) {
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             }
                             local_Dinv_residual[k0] = val;
                           });

      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }

  // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
  // iteration, when the initial guess for y is zero. This means the residual
  // vector is just b. The kernel applies the inverse diags to b to find y, and
  // also puts the partial squared update norms (1 per row) into W.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

}  // namespace Ifpack2::BlockHelperDetails

#endif