// Ifpack2_Details_ChebyshevKernel_def.hpp
// Definitions for Ifpack2::Details::ChebyshevKernel, which fuses the
// Chebyshev update W := alpha * D * (B - A*X) + beta * W (and X := X + W)
// into a single kernel launch when possible.
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
11 #define IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
12 
13 #include "Tpetra_CrsMatrix.hpp"
14 #include "Tpetra_MultiVector.hpp"
15 #include "Tpetra_Operator.hpp"
16 #include "Tpetra_Vector.hpp"
17 #include "Tpetra_Export_decl.hpp"
18 #include "Tpetra_Import_decl.hpp"
19 #include "Kokkos_ArithTraits.hpp"
20 #include "Teuchos_Assert.hpp"
21 #include <type_traits>
22 #include "KokkosSparse_spmv_impl.hpp"
23 
24 namespace Ifpack2 {
25 namespace Details {
26 namespace Impl {
27 
32 template <class WVector,
33  class DVector,
34  class BVector,
35  class AMatrix,
36  class XVector_colMap,
37  class XVector_domMap,
38  class Scalar,
39  bool use_beta,
40  bool do_X_update>
42  static_assert(static_cast<int>(WVector::rank) == 1,
43  "WVector must be a rank 1 View.");
44  static_assert(static_cast<int>(DVector::rank) == 1,
45  "DVector must be a rank 1 View.");
46  static_assert(static_cast<int>(BVector::rank) == 1,
47  "BVector must be a rank 1 View.");
48  static_assert(static_cast<int>(XVector_colMap::rank) == 1,
49  "XVector_colMap must be a rank 1 View.");
50  static_assert(static_cast<int>(XVector_domMap::rank) == 1,
51  "XVector_domMap must be a rank 1 View.");
52 
53  using execution_space = typename AMatrix::execution_space;
54  using LO = typename AMatrix::non_const_ordinal_type;
55  using value_type = typename AMatrix::non_const_value_type;
56  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
57  using team_member = typename team_policy::member_type;
58  using ATV = Kokkos::ArithTraits<value_type>;
59 
60  const Scalar alpha;
61  WVector m_w;
62  DVector m_d;
63  BVector m_b;
64  AMatrix m_A;
65  XVector_colMap m_x_colMap;
66  XVector_domMap m_x_domMap;
67  const Scalar beta;
68 
69  const LO rows_per_team;
70 
71  ChebyshevKernelVectorFunctor(const Scalar& alpha_,
72  const WVector& m_w_,
73  const DVector& m_d_,
74  const BVector& m_b_,
75  const AMatrix& m_A_,
76  const XVector_colMap& m_x_colMap_,
77  const XVector_domMap& m_x_domMap_,
78  const Scalar& beta_,
79  const int rows_per_team_)
80  : alpha(alpha_)
81  , m_w(m_w_)
82  , m_d(m_d_)
83  , m_b(m_b_)
84  , m_A(m_A_)
85  , m_x_colMap(m_x_colMap_)
86  , m_x_domMap(m_x_domMap_)
87  , beta(beta_)
88  , rows_per_team(rows_per_team_) {
89  const size_t numRows = m_A.numRows();
90  const size_t numCols = m_A.numCols();
91 
92  TEUCHOS_ASSERT(m_w.extent(0) == m_d.extent(0));
93  TEUCHOS_ASSERT(m_w.extent(0) == m_b.extent(0));
94  TEUCHOS_ASSERT(numRows == size_t(m_w.extent(0)));
95  TEUCHOS_ASSERT(numCols <= size_t(m_x_colMap.extent(0)));
96  TEUCHOS_ASSERT(numRows <= size_t(m_x_domMap.extent(0)));
97  }
98 
99  KOKKOS_INLINE_FUNCTION
100  void operator()(const team_member& dev) const {
101  using residual_value_type = typename BVector::non_const_value_type;
102  using KAT = Kokkos::ArithTraits<residual_value_type>;
103 
104  Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),
105  [&](const LO& loop) {
106  const LO lclRow =
107  static_cast<LO>(dev.league_rank()) * rows_per_team + loop;
108  if (lclRow >= m_A.numRows()) {
109  return;
110  }
111  const KokkosSparse::SparseRowViewConst<AMatrix> A_row = m_A.rowConst(lclRow);
112  const LO row_length = static_cast<LO>(A_row.length);
113  residual_value_type A_x = KAT::zero();
114 
115  Kokkos::parallel_reduce(
116  Kokkos::ThreadVectorRange(dev, row_length),
117  [&](const LO iEntry, residual_value_type& lsum) {
118  const auto A_val = A_row.value(iEntry);
119  lsum += A_val * m_x_colMap(A_row.colidx(iEntry));
120  },
121  A_x);
122 
123  Kokkos::single(Kokkos::PerThread(dev),
124  [&]() {
125  const auto alpha_D_res =
126  alpha * m_d(lclRow) * (m_b(lclRow) - A_x);
127  if (use_beta) {
128  m_w(lclRow) = beta * m_w(lclRow) + alpha_D_res;
129  } else {
130  m_w(lclRow) = alpha_D_res;
131  }
132  if (do_X_update)
133  m_x_domMap(lclRow) += m_w(lclRow);
134  });
135  });
136  }
137 };
138 
139 // W := alpha * D * (B - A*X) + beta * W.
140 template <class WVector,
141  class DVector,
142  class BVector,
143  class AMatrix,
144  class XVector_colMap,
145  class XVector_domMap,
146  class Scalar>
147 static void
148 chebyshev_kernel_vector(const Scalar& alpha,
149  const WVector& w,
150  const DVector& d,
151  const BVector& b,
152  const AMatrix& A,
153  const XVector_colMap& x_colMap,
154  const XVector_domMap& x_domMap,
155  const Scalar& beta,
156  const bool do_X_update) {
157  using execution_space = typename AMatrix::execution_space;
158 
159  if (A.numRows() == 0) {
160  return;
161  }
162 
163  int team_size = -1;
164  int vector_length = -1;
165  int64_t rows_per_thread = -1;
166 
167  const int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length);
168  int64_t worksets = (b.extent(0) + rows_per_team - 1) / rows_per_team;
169 
170  using Kokkos::Dynamic;
171  using Kokkos::Schedule;
172  using Kokkos::Static;
173  using Kokkos::TeamPolicy;
174  using policy_type_dynamic = TeamPolicy<execution_space, Schedule<Dynamic> >;
175  using policy_type_static = TeamPolicy<execution_space, Schedule<Static> >;
176  const char kernel_label[] = "chebyshev_kernel_vector";
177  policy_type_dynamic policyDynamic(1, 1);
178  policy_type_static policyStatic(1, 1);
179  if (team_size < 0) {
180  policyDynamic = policy_type_dynamic(worksets, Kokkos::AUTO, vector_length);
181  policyStatic = policy_type_static(worksets, Kokkos::AUTO, vector_length);
182  } else {
183  policyDynamic = policy_type_dynamic(worksets, team_size, vector_length);
184  policyStatic = policy_type_static(worksets, team_size, vector_length);
185  }
186 
187  // Canonicalize template arguments to avoid redundant instantiations.
188  using w_vec_type = typename WVector::non_const_type;
189  using d_vec_type = typename DVector::const_type;
190  using b_vec_type = typename BVector::const_type;
191  using matrix_type = AMatrix;
192  using x_colMap_vec_type = typename XVector_colMap::const_type;
193  using x_domMap_vec_type = typename XVector_domMap::non_const_type;
194  using scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
195 
196  if (beta == Kokkos::ArithTraits<Scalar>::zero()) {
197  constexpr bool use_beta = false;
198  if (do_X_update) {
199  using functor_type =
200  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
201  b_vec_type, matrix_type,
202  x_colMap_vec_type, x_domMap_vec_type,
203  scalar_type,
204  use_beta,
205  true>;
206  functor_type func(alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
207  if (A.nnz() > 10000000)
208  Kokkos::parallel_for(kernel_label, policyDynamic, func);
209  else
210  Kokkos::parallel_for(kernel_label, policyStatic, func);
211  } else {
212  using functor_type =
213  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
214  b_vec_type, matrix_type,
215  x_colMap_vec_type, x_domMap_vec_type,
216  scalar_type,
217  use_beta,
218  false>;
219  functor_type func(alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
220  if (A.nnz() > 10000000)
221  Kokkos::parallel_for(kernel_label, policyDynamic, func);
222  else
223  Kokkos::parallel_for(kernel_label, policyStatic, func);
224  }
225  } else {
226  constexpr bool use_beta = true;
227  if (do_X_update) {
228  using functor_type =
229  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
230  b_vec_type, matrix_type,
231  x_colMap_vec_type, x_domMap_vec_type,
232  scalar_type,
233  use_beta,
234  true>;
235  functor_type func(alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
236  if (A.nnz() > 10000000)
237  Kokkos::parallel_for(kernel_label, policyDynamic, func);
238  else
239  Kokkos::parallel_for(kernel_label, policyStatic, func);
240  } else {
241  using functor_type =
242  ChebyshevKernelVectorFunctor<w_vec_type, d_vec_type,
243  b_vec_type, matrix_type,
244  x_colMap_vec_type, x_domMap_vec_type,
245  scalar_type,
246  use_beta,
247  false>;
248  functor_type func(alpha, w, d, b, A, x_colMap, x_domMap, beta, rows_per_team);
249  if (A.nnz() > 10000000)
250  Kokkos::parallel_for(kernel_label, policyDynamic, func);
251  else
252  Kokkos::parallel_for(kernel_label, policyStatic, func);
253  }
254  }
255 }
256 
257 } // namespace Impl
258 
259 template <class TpetraOperatorType>
260 ChebyshevKernel<TpetraOperatorType>::
261  ChebyshevKernel(const Teuchos::RCP<const operator_type>& A,
262  const bool useNativeSpMV)
263  : useNativeSpMV_(useNativeSpMV) {
264  setMatrix(A);
265 }
266 
267 template <class TpetraOperatorType>
268 void ChebyshevKernel<TpetraOperatorType>::
269  setMatrix(const Teuchos::RCP<const operator_type>& A) {
270  if (A_op_.get() != A.get()) {
271  A_op_ = A;
272 
273  // We'll (re)allocate these on demand.
274  V1_ = std::unique_ptr<multivector_type>(nullptr);
275 
276  using Teuchos::rcp_dynamic_cast;
278  rcp_dynamic_cast<const crs_matrix_type>(A);
279  if (A_crs.is_null()) {
280  A_crs_ = Teuchos::null;
281  imp_ = Teuchos::null;
282  exp_ = Teuchos::null;
283  X_colMap_ = nullptr;
284  } else {
285  TEUCHOS_ASSERT(A_crs->isFillComplete());
286  A_crs_ = A_crs;
287  auto G = A_crs->getCrsGraph();
288  imp_ = G->getImporter();
289  exp_ = G->getExporter();
290  if (!imp_.is_null()) {
291  if (X_colMap_.get() == nullptr ||
292  !X_colMap_->getMap()->isSameAs(*(imp_->getTargetMap()))) {
293  X_colMap_ = std::unique_ptr<vector_type>(new vector_type(imp_->getTargetMap()));
294  }
295  } else
296  X_colMap_ = nullptr;
297  }
298  }
299 }
300 
301 template <class TpetraOperatorType>
302 void ChebyshevKernel<TpetraOperatorType>::
303  compute(multivector_type& W,
304  const SC& alpha,
305  vector_type& D_inv,
306  multivector_type& B,
307  multivector_type& X,
308  const SC& beta) {
309  using Teuchos::RCP;
310  using Teuchos::rcp;
311 
312  if (canFuse(B)) {
313  // "nonconst" here has no effect other than on the return type.
314  W_vec_ = W.getVectorNonConst(0);
315  B_vec_ = B.getVectorNonConst(0);
316  X_vec_ = X.getVectorNonConst(0);
317  TEUCHOS_ASSERT(!A_crs_.is_null());
318  fusedCase(*W_vec_, alpha, D_inv, *B_vec_, *A_crs_, *X_vec_, beta);
319  } else {
320  TEUCHOS_ASSERT(!A_op_.is_null());
321  unfusedCase(W, alpha, D_inv, B, *A_op_, X, beta);
322  }
323 }
324 
325 template <class TpetraOperatorType>
326 typename ChebyshevKernel<TpetraOperatorType>::vector_type&
327 ChebyshevKernel<TpetraOperatorType>::
328  importVector(vector_type& X_domMap) {
329  if (imp_.is_null()) {
330  return X_domMap;
331  } else {
332  X_colMap_->doImport(X_domMap, *imp_, Tpetra::REPLACE);
333  return *X_colMap_;
334  }
335 }
336 
337 template <class TpetraOperatorType>
338 bool ChebyshevKernel<TpetraOperatorType>::
339  canFuse(const multivector_type& B) const {
340  // If override is enabled
341  if (useNativeSpMV_)
342  return false;
343 
344  // Some criteria must be met for fused kernel
345  return B.getNumVectors() == size_t(1) &&
346  !A_crs_.is_null() &&
347  exp_.is_null();
348 }
349 
350 template <class TpetraOperatorType>
351 void ChebyshevKernel<TpetraOperatorType>::
352  unfusedCase(multivector_type& W,
353  const SC& alpha,
354  vector_type& D_inv,
355  multivector_type& B,
356  const operator_type& A,
357  multivector_type& X,
358  const SC& beta) {
359  using STS = Teuchos::ScalarTraits<SC>;
360  if (V1_.get() == nullptr) {
361  using MV = multivector_type;
362  const size_t numVecs = B.getNumVectors();
363  V1_ = std::unique_ptr<MV>(new MV(B.getMap(), numVecs));
364  }
365  const SC one = Teuchos::ScalarTraits<SC>::one();
366 
367  // V1 = B - A*X
368  Tpetra::deep_copy(*V1_, B);
369  A.apply(X, *V1_, Teuchos::NO_TRANS, -one, one);
370 
371  // W := alpha * D_inv * V1 + beta * W
372  W.elementWiseMultiply(alpha, D_inv, *V1_, beta);
373 
374  // X := X + W
375  X.update(STS::one(), W, STS::one());
376 }
377 
378 template <class TpetraOperatorType>
379 void ChebyshevKernel<TpetraOperatorType>::
380  fusedCase(vector_type& W,
381  const SC& alpha,
382  vector_type& D_inv,
383  vector_type& B,
384  const crs_matrix_type& A,
385  vector_type& X,
386  const SC& beta) {
387  vector_type& X_colMap = importVector(X);
388 
389  using Impl::chebyshev_kernel_vector;
390  using STS = Teuchos::ScalarTraits<SC>;
391 
392  auto A_lcl = A.getLocalMatrixDevice();
393  // D_inv, B, X and W are all Vectors, so it's safe to take the first column only
394  auto Dinv_lcl = Kokkos::subview(D_inv.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
395  auto B_lcl = Kokkos::subview(B.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
396  auto X_domMap_lcl = Kokkos::subview(X.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
397  auto X_colMap_lcl = Kokkos::subview(X_colMap.getLocalViewDevice(Tpetra::Access::ReadOnly), Kokkos::ALL(), 0);
398 
399  const bool do_X_update = !imp_.is_null();
400  if (beta == STS::zero()) {
401  auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::OverwriteAll), Kokkos::ALL(), 0);
402  chebyshev_kernel_vector(alpha, W_lcl, Dinv_lcl,
403  B_lcl, A_lcl,
404  X_colMap_lcl, X_domMap_lcl,
405  beta,
406  do_X_update);
407  } else { // need to read _and_ write W if beta != 0
408  auto W_lcl = Kokkos::subview(W.getLocalViewDevice(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
409  chebyshev_kernel_vector(alpha, W_lcl, Dinv_lcl,
410  B_lcl, A_lcl,
411  X_colMap_lcl, X_domMap_lcl,
412  beta,
413  do_X_update);
414  }
415  if (!do_X_update)
416  X.update(STS::one(), W, STS::one());
417 }
418 
419 } // namespace Details
420 } // namespace Ifpack2
421 
422 #define IFPACK2_DETAILS_CHEBYSHEVKERNEL_INSTANT(SC, LO, GO, NT) \
423  template class Ifpack2::Details::ChebyshevKernel<Tpetra::Operator<SC, LO, GO, NT> >;
424 
425 #endif // IFPACK2_DETAILS_CHEBYSHEVKERNEL_DEF_HPP
// End of Ifpack2_Details_ChebyshevKernel_def.hpp