Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12 
13 #include <memory>
14 #include <string>
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
30 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace UnpackAndCombineCrsMatrixImpl {
60 
70 template<class ST, class LO, class GO>
71 KOKKOS_FUNCTION int
72 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
73  const typename PackTraits<int>::output_array_type& pids_out,
74  const typename PackTraits<ST>::output_array_type& vals_out,
75  const char imports[],
76  const size_t offset,
77  const size_t /* num_bytes */,
78  const size_t num_ent,
79  const size_t bytes_per_value)
80 {
81  if (num_ent == 0) {
82  // Empty rows always take zero bytes, to ensure sparsity.
83  return 0;
84  }
85  bool unpack_pids = pids_out.size() > 0;
86 
87  const size_t num_ent_beg = offset;
88  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
89 
90  const size_t gids_beg = num_ent_beg + num_ent_len;
91  const size_t gids_len =
92  num_ent * PackTraits<GO>::packValueCount (GO (0));
93 
94  const size_t pids_beg = gids_beg + gids_len;
95  const size_t pids_len = unpack_pids ?
96  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
97  size_t (0);
98 
99  const size_t vals_beg = gids_beg + gids_len + pids_len;
100  const size_t vals_len = num_ent * bytes_per_value;
101 
102  const char* const num_ent_in = imports + num_ent_beg;
103  const char* const gids_in = imports + gids_beg;
104  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
105  const char* const vals_in = imports + vals_beg;
106 
107  size_t num_bytes_out = 0;
108  LO num_ent_out;
109  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
110  if (static_cast<size_t> (num_ent_out) != num_ent) {
111  return 20; // error code
112  }
113 
114  {
115  Kokkos::pair<int, size_t> p;
116  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
117  if (p.first != 0) {
118  return 21; // error code
119  }
120  num_bytes_out += p.second;
121 
122  if (unpack_pids) {
123  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
124  if (p.first != 0) {
125  return 22; // error code
126  }
127  num_bytes_out += p.second;
128  }
129 
130  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
131  if (p.first != 0) {
132  return 23; // error code
133  }
134  num_bytes_out += p.second;
135  }
136 
137  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138  if (num_bytes_out != expected_num_bytes) {
139  return 24; // error code
140  }
141  return 0; // no errors
142 } //unpackRow
143 
154 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
156  typedef LocalMatrix local_matrix_type;
157  typedef LocalMap local_map_type;
158 
159  typedef typename local_matrix_type::value_type ST;
160  typedef typename local_map_type::local_ordinal_type LO;
161  typedef typename local_map_type::global_ordinal_type GO;
162  typedef typename local_map_type::device_type DT;
163  typedef typename DT::execution_space XS;
164 
165  typedef Kokkos::View<const size_t*, BufferDeviceType>
166  num_packets_per_lid_type;
167  typedef Kokkos::View<const size_t*, DT> offsets_type;
168  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
170 
171  typedef Kokkos::View<int, DT> error_type;
172  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
173 
174  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175  "LocalMap::local_ordinal_type and "
176  "LocalMatrix::ordinal_type must be the same.");
177 
178  local_matrix_type local_matrix;
179  local_map_type local_col_map;
180  input_buffer_type imports;
181  num_packets_per_lid_type num_packets_per_lid;
182  import_lids_type import_lids;
183  Kokkos::View<const LO*[2], DT> batch_info;
184  offsets_type offsets;
185  Tpetra::CombineMode combine_mode;
186  size_t batch_size;
187  size_t bytes_per_value;
188  bool atomic;
189  error_type error_code;
190 
192  const local_matrix_type& local_matrix_in,
193  const local_map_type& local_col_map_in,
194  const input_buffer_type& imports_in,
195  const num_packets_per_lid_type& num_packets_per_lid_in,
196  const import_lids_type& import_lids_in,
197  const Kokkos::View<const LO*[2], DT>& batch_info_in,
198  const offsets_type& offsets_in,
199  const Tpetra::CombineMode combine_mode_in,
200  const size_t batch_size_in,
201  const size_t bytes_per_value_in,
202  const bool atomic_in) :
203  local_matrix (local_matrix_in),
204  local_col_map (local_col_map_in),
205  imports (imports_in),
206  num_packets_per_lid (num_packets_per_lid_in),
207  import_lids (import_lids_in),
208  batch_info (batch_info_in),
209  offsets (offsets_in),
210  combine_mode (combine_mode_in),
211  batch_size (batch_size_in),
212  bytes_per_value (bytes_per_value_in),
213  atomic (atomic_in),
214  error_code("error")
215  {}
216 
217  KOKKOS_INLINE_FUNCTION
218  void operator()(member_type team_member) const
219  {
220  using Kokkos::View;
221  using Kokkos::subview;
222  using Kokkos::MemoryUnmanaged;
223 
224  const LO batch = team_member.league_rank();
225  const LO lid_no = batch_info(batch, 0);
226  const LO batch_no = batch_info(batch, 1);
227 
228  const size_t num_bytes = num_packets_per_lid(lid_no);
229 
230  // Only unpack data if there is a nonzero number of bytes.
231  if (num_bytes == 0)
232  return;
233 
234  // there is actually something in the row
235  const LO import_lid = import_lids(lid_no);
236  const size_t buf_size = imports.size();
237  const size_t offset = offsets(lid_no);
238 
239  // Get the number of entries to expect in the received data for this row.
240  LO num_ent_LO = 0;
241  const char* const in_buf = imports.data() + offset;
242  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
243  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
244 
245  // Count the number of bytes expected to unpack
246  size_t expected_num_bytes = 0;
247  {
248  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
249  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
250  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
251  }
252 
253  if (expected_num_bytes > num_bytes)
254  {
255 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
256 #ifndef KOKKOS_ENABLE_SYCL
257  printf(
258  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
261  );
262 #endif
263  Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
264  return;
265  }
266 
267  if (offset > buf_size || offset + num_bytes > buf_size)
268  {
269 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
270 #ifndef KOKKOS_ENABLE_SYCL
271  printf(
272  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273  "At row %d, the offset (%d) > buffer size (%d)\n",
274  (int) lid_no, (int) offset, (int) buf_size
275  );
276 #endif
277  Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
278  return;
279  }
280 
281  // Determine the number of entries to unpack in this batch
282  size_t num_entries_in_batch = 0;
283  if (num_entries_in_row <= batch_size)
284  num_entries_in_batch = num_entries_in_row;
285  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286  num_entries_in_batch = batch_size;
287  else
288  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
289 
290  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
291  const size_t num_ent_start = offset;
292  const size_t num_ent_end = num_ent_start + bytes_per_lid;
293 
294  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
295  const size_t gids_start = num_ent_end;
296  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
297 
298  const size_t vals_start = gids_end;
299 
300  const size_t shift = batch_no * batch_size;
301  const char* const num_ent_in = imports.data() + num_ent_start;
302  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
304 
305  LO num_ent_out;
306  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
307  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
308  {
309 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
310 #ifndef KOKKOS_ENABLE_SYCL
311  printf(
312  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
315  );
316 #endif
317  Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
318  }
319 
320  constexpr bool matrix_has_sorted_rows = true; // see #6282
321  //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
322  //By ref triggers compiler bug in CUDA 10.
323  Kokkos::parallel_for(
324  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
325  [=](const LO& j)
326  {
327  size_t distance = 0;
328 
329  GO gid_out;
330  distance = j * bytes_per_gid;
331  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
332  auto lid_out = local_col_map.getLocalElement(gid_out);
333 
334  // Column indices come in as global indices, in case the
335  // source object's column Map differs from the target object's
336  // (this's) column Map, and must be converted local index values
337 
338  // assume that ST is default constructible
339  ST val_out;
340  distance = j * bytes_per_value;
341  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
342 
343  if (combine_mode == ADD) {
344  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
345  // different threads don't touch the same row (i.e., no
346  // duplicates in incoming LIDs list).
347  const bool use_atomic_updates = atomic;
348  (void)local_matrix.sumIntoValues(
349  import_lid,
350  &lid_out,
351  1,
352  &val_out,
353  matrix_has_sorted_rows,
354  use_atomic_updates
355  );
356  } else if (combine_mode == REPLACE) {
357  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
358  // combine mode with multiple incoming rows that touch the same
359  // target matrix entries, so we never need atomic updates.
360  const bool use_atomic_updates = false;
361  (void)local_matrix.replaceValues(
362  import_lid,
363  &lid_out,
364  1,
365  &val_out,
366  matrix_has_sorted_rows,
367  use_atomic_updates
368  );
369  } else {
370  // should never get here
371 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
372 #ifndef KOKKOS_ENABLE_SYCL
373  printf(
374  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
376  );
377 #endif
378  Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
379  }
380  }
381  );
382 
383  team_member.team_barrier();
384 
385  }
386 
388  int error() const {
389  auto error_code_h = Kokkos::create_mirror_view_and_copy(
390  Kokkos::HostSpace(), error_code
391  );
392  return error_code_h();
393  }
394 
395 }; //UnpackCrsMatrixAndCombineFunctor
396 
397 struct MaxNumEntTag {};
398 struct TotNumEntTag {};
399 
408 template<class LO, class DT, class BDT>
410 public:
411  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412  typedef Kokkos::View<const size_t*, DT> offsets_type;
413  typedef Kokkos::View<const char*, BDT> input_buffer_type;
414  // This needs to be public, since it appears in the argument list of
415  // public methods (see below). Otherwise, build errors may happen.
416  typedef size_t value_type;
417 
418 private:
419  num_packets_per_lid_type num_packets_per_lid;
420  offsets_type offsets;
421  input_buffer_type imports;
422 
423 public:
424  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
425  const offsets_type& offsets_in,
426  const input_buffer_type& imports_in) :
427  num_packets_per_lid (num_packets_per_lid_in),
428  offsets (offsets_in),
429  imports (imports_in)
430  {}
431 
432  KOKKOS_INLINE_FUNCTION void
433  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
434  // Get how many entries to expect in the received data for this row.
435  const size_t num_bytes = num_packets_per_lid(i);
436  if (num_bytes > 0) {
437  LO num_ent_LO = 0; // output argument of unpackValue
438  const char* const in_buf = imports.data () + offsets(i);
439  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
440  const size_t num_ent = static_cast<size_t> (num_ent_LO);
441 
442  update = (update < num_ent) ? num_ent : update;
443  }
444  }
445 
446  KOKKOS_INLINE_FUNCTION void
447  join (const MaxNumEntTag,
448  value_type& dst,
449  const value_type& src) const
450  {
451  if (dst < src) dst = src;
452  }
453 
454  KOKKOS_INLINE_FUNCTION void
455  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
456  // Get how many entries to expect in the received data for this row.
457  const size_t num_bytes = num_packets_per_lid(i);
458  if (num_bytes > 0) {
459  LO num_ent_LO = 0; // output argument of unpackValue
460  const char* const in_buf = imports.data () + offsets(i);
461  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
462  tot_num_ent += static_cast<size_t> (num_ent_LO);
463  }
464  }
465 }; //NumEntriesFunctor
466 
474 template<class LO, class DT, class BDT>
475 size_t
476 compute_maximum_num_entries (
477  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478  const Kokkos::View<const size_t*, DT>& offsets,
479  const Kokkos::View<const char*, BDT>& imports)
480 {
481  typedef typename DT::execution_space XS;
482  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483  MaxNumEntTag> range_policy;
484 
485  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
486  imports);
487  const LO numRowsToUnpack =
488  static_cast<LO> (num_packets_per_lid.extent (0));
489  size_t max_num_ent = 0;
490  Kokkos::parallel_reduce ("Max num entries in CRS",
491  range_policy (0, numRowsToUnpack),
492  functor, max_num_ent);
493  return max_num_ent;
494 }
495 
503 template<class LO, class DT, class BDT>
504 size_t
505 compute_total_num_entries (
506  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507  const Kokkos::View<const size_t*, DT>& offsets,
508  const Kokkos::View<const char*, BDT>& imports)
509 {
510  typedef typename DT::execution_space XS;
511  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512  size_t tot_num_ent = 0;
513  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
514  imports);
515  const LO numRowsToUnpack =
516  static_cast<LO> (num_packets_per_lid.extent (0));
517  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
518  range_policy (0, numRowsToUnpack),
519  functor, tot_num_ent);
520  return tot_num_ent;
521 }
522 
523 template<class LO>
524 KOKKOS_INLINE_FUNCTION
525 size_t
526 unpackRowCount(const char imports[],
527  const size_t offset,
528  const size_t num_bytes)
529 {
530  using PT = PackTraits<LO>;
531 
532  LO num_ent_LO = 0;
533  if (num_bytes > 0) {
534  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
535  if (p_num_bytes > num_bytes) {
536  return OrdinalTraits<size_t>::invalid();
537  }
538  const char* const in_buf = imports + offset;
539  (void) PT::unpackValue(num_ent_LO, in_buf);
540  }
541  return static_cast<size_t>(num_ent_LO);
542 }
543 
548 template<class View1, class View2>
549 inline
550 bool
551 compute_batch_info(
552  const View1& batches_per_lid,
553  View2& batch_info
554 )
555 {
556  using LO = typename View2::value_type;
557  size_t batch = 0;
558  for (size_t i=0; i<batches_per_lid.extent(0); i++)
559  {
560  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
561  {
562  batch_info(batch, 0) = static_cast<LO>(i);
563  batch_info(batch, 1) = batch_no;
564  batch++;
565  }
566  }
567  return batch == batch_info.extent(0);
568 }
569 
577 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
578 void
579 unpackAndCombineIntoCrsMatrix(
580  const LocalMatrix& local_matrix,
581  const LocalMap& local_map,
582  const Kokkos::View<const char*, BufferDeviceType>& imports,
583  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
584  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
585  const Tpetra::CombineMode combine_mode)
586 {
587  using ST = typename LocalMatrix::value_type;
588  using LO = typename LocalMap::local_ordinal_type;
589  using DT = typename LocalMap::device_type;
590  using XS = typename DT::execution_space;
591  const char prefix[] =
592  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593  "unpackAndCombineIntoCrsMatrix: ";
594 
595  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
596  if (num_import_lids == 0) {
597  // Nothing to unpack
598  return;
599  }
600 
601  {
602  // Check for correct input
603  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
604  std::invalid_argument,
605  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
606  "static graph (i.e., was constructed with the CrsMatrix constructor "
607  "that takes a const CrsGraph pointer).");
608 
609  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
610  std::invalid_argument,
611  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
612  "(i.e., was constructed with the CrsMatrix constructor that takes a "
613  "const CrsGraph pointer).");
614 
615  // Unknown combine mode!
616  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
617  std::invalid_argument,
618  prefix << "Invalid combine mode; should never get "
619  "here! Please report this bug to the Tpetra developers.");
620 
621  // Check that sizes of input objects are consistent.
622  bool bad_num_import_lids =
623  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
624  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625  std::invalid_argument,
626  prefix << "importLIDs.size() (" << num_import_lids << ") != "
627  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
628  } // end QA error checking
629 
630  // Get the offsets
631  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
632  computeOffsetsFromCounts(offsets, num_packets_per_lid);
633 
634  // Determine the sizes of the unpack batches
635  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
636  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
637  const size_t batch_size = std::min(default_batch_size, max_num_ent);
638 
639  // To achieve some balance amongst threads, unpack each row in equal size batches
640  size_t num_batches = 0;
641  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
642  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
643  // Compute meta data that allows batch unpacking
644  Kokkos::parallel_reduce(
645  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646  KOKKOS_LAMBDA(const size_t i, size_t& batches)
647  {
648  const size_t num_entries_in_row = unpackRowCount<LO>(
649  imports.data(), offsets(i), num_packets_per_lid(i)
650  );
651  batches_per_lid(i) =
652  (num_entries_in_row <= batch_size) ?
653  1 :
654  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655  batches += batches_per_lid(i);
656  },
657  num_batches
658  );
659  Kokkos::resize(batch_info, num_batches);
660 
661  Kokkos::HostSpace host_space;
662  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
663  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
664  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
665 
666  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
667 
668  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
669  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
670  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
671 
672  // FIXME (TJF SEP 2017)
673  // The scalar type is not necessarily default constructible
674  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
675 
676  // Now do the actual unpack!
677  const bool atomic = XS().concurrency() != 1;
678  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
679  functor f(
680  local_matrix,
681  local_map,
682  imports,
683  num_packets_per_lid,
684  import_lids,
685  batch_info,
686  offsets,
687  combine_mode,
688  batch_size,
689  bytes_per_value,
690  atomic
691  );
692 
693  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695  if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
696  {
697  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
698  }
699  else
700  {
701  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
702  }
703 
704  auto error_code = f.error();
705  TEUCHOS_TEST_FOR_EXCEPTION(
706  error_code != 0,
707  std::runtime_error,
708  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
709  );
710 } //unpackAndCombineIntoCrsMatrix (Kokkos version)
711 
712 template<class LocalMatrix, class BufferDeviceType>
713 size_t
715  const LocalMatrix& local_matrix,
716  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
717 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
718  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
719  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
720 #else
721  const Kokkos::View<const char*, BufferDeviceType>& imports,
722  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
723 #endif
724  const size_t num_same_ids)
725 {
726  using Kokkos::parallel_reduce;
727  typedef typename LocalMatrix::ordinal_type LO;
728  typedef typename LocalMatrix::device_type device_type;
729  typedef typename device_type::execution_space XS;
730  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
731  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
732  typedef BufferDeviceType BDT;
733 
734  size_t count = 0;
735  LO num_items;
736 
737  // Number of matrix entries to unpack (returned by this function).
738  num_items = static_cast<LO>(num_same_ids);
739  if (num_items) {
740  size_t kcnt = 0;
741  parallel_reduce(range_policy(0, num_items),
742  KOKKOS_LAMBDA(const LO lid, size_t& update) {
743  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
744  -local_matrix.graph.row_map[lid]);
745  }, kcnt);
746  count += kcnt;
747  }
748 
749  // Count entries copied directly from the source matrix with permuting.
750  num_items = static_cast<LO>(permute_from_lids.extent(0));
751  if (num_items) {
752  size_t kcnt = 0;
753  parallel_reduce(range_policy(0, num_items),
754  KOKKOS_LAMBDA(const LO i, size_t& update) {
755  const LO lid = permute_from_lids(i);
756  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
757  - local_matrix.graph.row_map[lid]);
758  }, kcnt);
759  count += kcnt;
760  }
761 
762  {
763  // Count entries received from other MPI processes.
764  const size_type np = num_packets_per_lid.extent(0);
765  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
766  computeOffsetsFromCounts(offsets, num_packets_per_lid);
767  count +=
768  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
769  offsets, imports);
770  }
771 
772  return count;
773 } //unpackAndCombineWithOwningPIDsCount (Kokkos version)
774 
776 template<class LO, class DT, class BDT>
777 int
778 setupRowPointersForRemotes(
779  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
780  const typename PackTraits<LO>::input_array_type& import_lids,
781  const Kokkos::View<const char*, BDT>& imports,
782  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
783  const typename PackTraits<size_t>::input_array_type& offsets)
784 {
785  using Kokkos::parallel_reduce;
786  typedef typename DT::execution_space XS;
787  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
788  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
789 
790  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
791  const size_type N = num_packets_per_lid.extent(0);
792 
793  int errors = 0;
794  parallel_reduce ("Setup row pointers for remotes",
795  range_policy (0, N),
796  KOKKOS_LAMBDA (const size_t i, int& k_error) {
797  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
798  const size_t num_bytes = num_packets_per_lid(i);
799  const size_t offset = offsets(i);
800  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
801  if (num_ent == InvalidNum) {
802  k_error += 1;
803  }
804  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
805  }, errors);
806  return errors;
807 }
808 
809 // Convert array of row lengths to a CRS pointer array
810 template<class DT>
811 void
812 makeCrsRowPtrFromLengths(
813  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
814  const Kokkos::View<size_t*,DT>& new_start_row)
815 {
816  using Kokkos::parallel_scan;
817  typedef typename DT::execution_space XS;
818  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
819  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
820  const size_type N = new_start_row.extent(0);
821  parallel_scan(range_policy(0, N),
822  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
823  auto cur_val = tgt_rowptr(i);
824  if (final) {
825  tgt_rowptr(i) = update;
826  new_start_row(i) = tgt_rowptr(i);
827  }
828  update += cur_val;
829  }
830  );
831 }
832 
833 template<class LocalMatrix, class LocalMap>
834 void
835 copyDataFromSameIDs(
836  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
837  const typename PackTraits<int>::output_array_type& tgt_pids,
838  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
839  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
840  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
841  const typename PackTraits<int>::input_array_type& src_pids,
842  const LocalMatrix& local_matrix,
843  const LocalMap& local_col_map,
844  const size_t num_same_ids,
845  const int my_pid)
846 {
847  using Kokkos::parallel_for;
848  typedef typename LocalMap::device_type DT;
849  typedef typename LocalMap::local_ordinal_type LO;
850  typedef typename DT::execution_space XS;
851  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
852 
853  parallel_for(range_policy(0, num_same_ids),
854  KOKKOS_LAMBDA(const size_t i) {
855  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
856 
857  const LO src_lid = static_cast<LO>(i);
858  size_t src_row = local_matrix.graph.row_map(src_lid);
859 
860  const LO tgt_lid = static_cast<LO>(i);
861  const size_t tgt_row = tgt_rowptr(tgt_lid);
862 
863  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
864  - local_matrix.graph.row_map(src_lid);
865  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
866 
867  for (size_t j=local_matrix.graph.row_map(src_lid);
868  j<local_matrix.graph.row_map(src_lid+1); ++j) {
869  LO src_col = local_matrix.graph.entries(j);
870  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
871  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
872  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
873  }
874  }
875  );
876 }
877 
878 template<class LocalMatrix, class LocalMap>
879 void
880 copyDataFromPermuteIDs(
881  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
882  const typename PackTraits<int>::output_array_type& tgt_pids,
883  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
884  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
885  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
886  const typename PackTraits<int>::input_array_type& src_pids,
887  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
888  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
889  const LocalMatrix& local_matrix,
890  const LocalMap& local_col_map,
891  const int my_pid)
892 {
893  using Kokkos::parallel_for;
894  typedef typename LocalMap::device_type DT;
895  typedef typename LocalMap::local_ordinal_type LO;
896  typedef typename DT::execution_space XS;
897  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
898  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
899 
900  const size_type num_permute_to_lids = permute_to_lids.extent(0);
901 
902  parallel_for(range_policy(0, num_permute_to_lids),
903  KOKKOS_LAMBDA(const size_t i) {
904  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
905 
906  const LO src_lid = permute_from_lids(i);
907  const size_t src_row = local_matrix.graph.row_map(src_lid);
908 
909  const LO tgt_lid = permute_to_lids(i);
910  const size_t tgt_row = tgt_rowptr(tgt_lid);
911 
912  size_t nsr = local_matrix.graph.row_map(src_lid+1)
913  - local_matrix.graph.row_map(src_lid);
914  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
915 
916  for (size_t j=local_matrix.graph.row_map(src_lid);
917  j<local_matrix.graph.row_map(src_lid+1); ++j) {
918  LO src_col = local_matrix.graph.entries(j);
919  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
920  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
921  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
922  }
923  }
924  );
925 }
926 
927 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
928 int
929 unpackAndCombineIntoCrsArrays2(
930  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
931  const typename PackTraits<int>::output_array_type& tgt_pids,
932  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
933  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
934  const typename PackTraits<size_t>::input_array_type& offsets,
935  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
936 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
937  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
938  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
939 #else
940  const Kokkos::View<const char*, BufferDeviceType>& imports,
941  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
942 #endif
943  const LocalMatrix& /* local_matrix */,
944  const LocalMap /*& local_col_map*/,
945  const int my_pid,
946  const size_t bytes_per_value)
947 {
948  using Kokkos::View;
949  using Kokkos::subview;
950  using Kokkos::MemoryUnmanaged;
951  using Kokkos::parallel_reduce;
952  using Kokkos::atomic_fetch_add;
954  typedef typename LocalMap::device_type DT;
955  typedef typename LocalMap::local_ordinal_type LO;
956  typedef typename LocalMap::global_ordinal_type GO;
957  typedef typename LocalMatrix::value_type ST;
958  typedef typename DT::execution_space XS;
959  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
960  typedef typename Kokkos::pair<size_type, size_type> slice;
961  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
962 
963  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
964  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
965  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
966 
967  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
968 
969  int errors = 0;
970  const size_type num_import_lids = import_lids.size();
971 
972  // RemoteIDs: Loop structure following UnpackAndCombine
973  parallel_reduce ("Unpack and combine into CRS",
974  range_policy (0, num_import_lids),
975  KOKKOS_LAMBDA (const size_t i, int& k_error) {
976  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
977  const size_t num_bytes = num_packets_per_lid(i);
978  const size_t offset = offsets(i);
979  if (num_bytes == 0) {
980  // Empty buffer means that the row is empty.
981  return;
982  }
983  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
984  if (num_ent == InvalidNum) {
985  k_error += 1;
986  return;
987  }
988  const LO lcl_row = import_lids(i);
989  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
990  const size_t end_row = start_row + num_ent;
991 
992  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
993  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
994  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
995 
996  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
997  imports.data(), offset, num_bytes,
998  num_ent, bytes_per_value);
999 
1000  // Correct target PIDs.
1001  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1002  const int pid = pids_out(j);
1003  pids_out(j) = (pid != my_pid) ? pid : -1;
1004  }
1005  }, errors);
1006 
1007  return errors;
1008 }
1009 
1010 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1011 void
1013  const LocalMatrix & local_matrix,
1014  const LocalMap & local_col_map,
1015  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1016 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1017  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1018  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1019 #else
1020  const Kokkos::View<const char*, BufferDeviceType>& imports,
1021  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1022 #endif
1023  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1024  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1025  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1026  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1027  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1028  const typename PackTraits<int>::input_array_type& src_pids,
1029  const typename PackTraits<int>::output_array_type& tgt_pids,
1030  const size_t num_same_ids,
1031  const size_t tgt_num_rows,
1032  const size_t tgt_num_nonzeros,
1033  const int my_tgt_pid,
1034  const size_t bytes_per_value)
1035 {
1036  using Kokkos::View;
1037  using Kokkos::subview;
1038  using Kokkos::parallel_for;
1039  using Kokkos::MemoryUnmanaged;
1041  typedef typename LocalMap::device_type DT;
1042  typedef typename LocalMap::local_ordinal_type LO;
1043  typedef typename DT::execution_space XS;
1044  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1045  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1046  typedef BufferDeviceType BDT;
1047 
1048  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1049 
1050  const size_t N = tgt_num_rows;
1051 
1052  // In the case of reduced communicators, the sourceMatrix won't have
1053  // the right "my_pid", so thus we have to supply it.
1054  const int my_pid = my_tgt_pid;
1055 
1056  // Zero the rowptr
1057  parallel_for(range_policy(0, N+1),
1058  KOKKOS_LAMBDA(const size_t i) {
1059  tgt_rowptr(i) = 0;
1060  }
1061  );
1062 
1063  // same IDs: Always first, always in the same place
1064  parallel_for(range_policy(0, num_same_ids),
1065  KOKKOS_LAMBDA(const size_t i) {
1066  const LO tgt_lid = static_cast<LO>(i);
1067  const LO src_lid = static_cast<LO>(i);
1068  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1069  - local_matrix.graph.row_map(src_lid);
1070  }
1071  );
1072 
1073  // Permute IDs: Still local, but reordered
1074  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1075  parallel_for(range_policy(0, num_permute_to_lids),
1076  KOKKOS_LAMBDA(const size_t i) {
1077  const LO tgt_lid = permute_to_lids(i);
1078  const LO src_lid = permute_from_lids(i);
1079  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1080  - local_matrix.graph.row_map(src_lid);
1081  }
1082  );
1083 
1084  // Get the offsets from the number of packets per LID
1085  const size_type num_import_lids = import_lids.extent(0);
1086  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1087  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1088 
1089 #ifdef HAVE_TPETRA_DEBUG
1090  {
1091  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1092  const bool condition =
1093  nth_offset_h != static_cast<size_t>(imports.extent (0));
1094  TEUCHOS_TEST_FOR_EXCEPTION
1095  (condition, std::logic_error, prefix
1096  << "The final offset in bytes " << nth_offset_h
1097  << " != imports.size() = " << imports.extent(0)
1098  << ". Please report this bug to the Tpetra developers.");
1099  }
1100 #endif // HAVE_TPETRA_DEBUG
1101 
1102  // Setup row pointers for remotes
1103  int k_error =
1104  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1105  import_lids, imports, num_packets_per_lid, offsets);
1106  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1107  << " Error transferring data to target row pointers. "
1108  "Please report this bug to the Tpetra developers.");
1109 
1110  // If multiple processes contribute to the same row, we may need to
1111  // update row offsets. This tracks that.
1112  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1113 
1114  // Turn row length into a real CRS row pointer
1115  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1116 
1117  // SameIDs: Copy the data over
1118  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1119  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1120 
1121  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1122  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1123  local_matrix, local_col_map, my_pid);
1124 
1125  if (imports.extent(0) <= 0) {
1126  return;
1127  }
1128 
1129  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1130  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1131  local_matrix, local_col_map, my_pid, bytes_per_value);
1132  TEUCHOS_TEST_FOR_EXCEPTION(
1133  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1134  "should never happen. Please report this bug to the Tpetra developers.");
1135 
1136  return;
1137 }
1138 
1139 } // namespace UnpackAndCombineCrsMatrixImpl
1140 
1180 template<typename ST, typename LO, typename GO, typename Node>
1181 void
1183  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1184  const Teuchos::ArrayView<const char>& imports,
1185  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1186  const Teuchos::ArrayView<const LO>& importLIDs,
1187  size_t /* constantNumPackets */,
1188  CombineMode combineMode)
1189 {
1190  using Kokkos::View;
1191  typedef typename Node::device_type device_type;
1192  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1193  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1194  "Node::device_type and LocalMatrix::device_type must be the same.");
1195 
1196  // Convert all Teuchos::Array to Kokkos::View.
1197  device_type outputDevice;
1198 
1199  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1200  // them to device. Since unpacking is done directly in to the local matrix
1201  // (lclMatrix), no copying needs to be performed after unpacking.
1202  auto num_packets_per_lid_d =
1203  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1204  numPacketsPerLID.size(), true, "num_packets_per_lid");
1205 
1206  auto import_lids_d =
1207  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1208  importLIDs.size(), true, "import_lids");
1209 
1210  auto imports_d =
1211  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1212  imports.size(), true, "imports");
1213 
1214  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1215  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1216 
1217 //KDDKDD This loop doesn't appear to do anything; what is it?
1218 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1219 //KDDKDD {
1220 //KDDKDD auto lclRow = importLIDs[i];
1221 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1222 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1223 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1224 //KDDKDD }
1225  // Now do the actual unpack!
1226  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1227  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1228  import_lids_d, combineMode);
1229 
1230 }
1231 
1232 template<typename ST, typename LO, typename GO, typename NT>
1233 void
1234 unpackCrsMatrixAndCombineNew(
1235  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1236  Kokkos::DualView<char*,
1238  Kokkos::DualView<size_t*,
1239  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1240  const Kokkos::DualView<const LO*,
1242  const size_t /* constantNumPackets */,
1243  const CombineMode combineMode)
1244 {
1245  using Kokkos::View;
1246  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1247  using dist_object_type = DistObject<char, LO, GO, NT>;
1248  using device_type = typename crs_matrix_type::device_type;
1249  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1250  using buffer_device_type = typename dist_object_type::buffer_device_type;
1251 
1252  static_assert
1253  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1254  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1255  "must be the same.");
1256 
1257  if (numPacketsPerLID.need_sync_device()) {
1258  numPacketsPerLID.sync_device ();
1259  }
1260  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1261 
1262  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1263  auto import_lids_d = importLIDs.view_device ();
1264 
1265  if (imports.need_sync_device()) {
1266  imports.sync_device ();
1267  }
1268  auto imports_d = imports.view_device ();
1269 
1270  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1271  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1272  typedef decltype (local_col_map) local_map_type;
1273 
1274  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1275  local_matrix_device_type,
1276  local_map_type,
1277  buffer_device_type
1278  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1279  import_lids_d, combineMode);
1280 }
1281 
1328 //
1337 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1338 size_t
1340  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1341  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1342  const Teuchos::ArrayView<const char> &imports,
1343  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1344  size_t /* constantNumPackets */,
1345  CombineMode /* combineMode */,
1346  size_t numSameIDs,
1347  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1348  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1349 {
1350  using Kokkos::MemoryUnmanaged;
1351  using Kokkos::View;
1352  typedef typename Node::device_type DT;
1353  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1354 
1355  TEUCHOS_TEST_FOR_EXCEPTION
1356  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1357  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1358  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1359  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1360  // process, then the matrix is neither locally nor globally indexed.
1361  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1362  TEUCHOS_TEST_FOR_EXCEPTION
1363  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1364  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1365  TEUCHOS_TEST_FOR_EXCEPTION
1366  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1367  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1368  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1369 
1370  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1371 
1372  using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1373  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1374 
1375 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1376  Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1377 #else
1378  Kokkos::View<LocalOrdinal const *, kokkos_device_type> permute_from_lids_d =
1379 #endif
1381  permuteFromLIDs.getRawPtr (),
1382  permuteFromLIDs.size (), true,
1383  "permute_from_lids");
1384 
1385 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1386  Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1387 #else
1388  Kokkos::View<const char*, kokkos_device_type> imports_d =
1389 #endif
1391  imports.getRawPtr (),
1392  imports.size (), true,
1393  "imports");
1394 
1395 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1396  Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1397 #else
1398  Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1399 #endif
1401  numPacketsPerLID.getRawPtr (),
1402  numPacketsPerLID.size (), true,
1403  "num_packets_per_lid");
1404 
1406  local_matrix, permute_from_lids_d, imports_d,
1407  num_packets_per_lid_d, numSameIDs);
1408 } //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1409 
1424 
1425 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1426 void
1429  const Kokkos::View<LocalOrdinal const *,
1430  Kokkos::Device<typename Node::device_type::execution_space,
1431  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1432 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1433  , void, void
1434 #endif
1435  > import_lids_d,
1436  const Kokkos::View<const char*,
1437  Kokkos::Device<typename Node::device_type::execution_space,
1438  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1439 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1440  , void, void
1441 #endif
1442  > imports_d,
1443  const Kokkos::View<const size_t*,
1444  Kokkos::Device<typename Node::device_type::execution_space,
1445  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1446 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1447  , void, void
1448 #endif
1449  > num_packets_per_lid_d,
1450  const size_t numSameIDs,
1451  const Kokkos::View<LocalOrdinal const *,
1452  Kokkos::Device<typename Node::device_type::execution_space,
1453  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1454 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1455  , void, void
1456 #endif
1457  > permute_to_lids_d,
1458  const Kokkos::View<LocalOrdinal const *,
1459  Kokkos::Device<typename Node::device_type::execution_space,
1460  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1461 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1462  , void, void
1463 #endif
1464  > permute_from_lids_d,
1465  size_t TargetNumRows,
1466  const int MyTargetPID,
1467  Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1468  Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1469  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,typename Node::device_type>& crs_vals_d,
1470  const Teuchos::ArrayView<const int>& SourcePids,
1471  Kokkos::View<int*,typename Node::device_type> &TargetPids)
1472 {
1473  using execution_space = typename Node::execution_space;
1475 
1476  using Kokkos::View;
1477  using Kokkos::deep_copy;
1478 
1479  using Teuchos::ArrayView;
1480  using Teuchos::outArg;
1481  using Teuchos::REDUCE_MAX;
1482  using Teuchos::reduceAll;
1483 
1484  typedef typename Node::device_type DT;
1485 
1487  typedef typename matrix_type::impl_scalar_type ST;
1488 
1489  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1490 # ifdef HAVE_TPETRA_MMM_TIMINGS
1491  using Teuchos::TimeMonitor;
1492  Teuchos::RCP<TimeMonitor> tm;
1493 # endif
1494 
1495  using Kokkos::MemoryUnmanaged;
1496 
1497  TEUCHOS_TEST_FOR_EXCEPTION
1498  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1499  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1500  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1501  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1502  // process, then the matrix is neither locally nor globally indexed.
1503  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1504  TEUCHOS_TEST_FOR_EXCEPTION
1505  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1506  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1507  TEUCHOS_TEST_FOR_EXCEPTION
1508  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1509  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1510  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1511 
1512  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1513 
1514  // TargetNumNonzeros is number of nonzeros in local matrix.
1515 # ifdef HAVE_TPETRA_MMM_TIMINGS
1516  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1517 # endif
1518  size_t TargetNumNonzeros =
1520  local_matrix, permute_from_lids_d, imports_d,
1521  num_packets_per_lid_d, numSameIDs);
1522 # ifdef HAVE_TPETRA_MMM_TIMINGS
1523  tm = Teuchos::null;
1524 # endif
1525 
1526 # ifdef HAVE_TPETRA_MMM_TIMINGS
1527  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1528 # endif
1529  Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1530  Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1531  Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1532 # ifdef HAVE_TPETRA_MMM_TIMINGS
1533  tm = Teuchos::null;
1534 # endif
1535 
1536  TEUCHOS_TEST_FOR_EXCEPTION(
1537  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1538  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1539  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1540 
1541  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1542  Kokkos::resize(TargetPids,TargetNumNonzeros);
1543  }
1544  Kokkos::deep_copy(execution_space(), TargetPids, -1);
1545 
1546  // Grab pointers for sourceMatrix
1547  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1548 
1549 # ifdef HAVE_TPETRA_MMM_TIMINGS
1550  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1551 # endif
1552  // Convert input arrays to Kokkos::Views
1553  DT outputDevice;
1554 
1555  auto src_pids_d =
1556  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1557  SourcePids.size(), true, "src_pids");
1558 
1559 # ifdef HAVE_TPETRA_MMM_TIMINGS
1560  tm = Teuchos::null;
1561 # endif
1562 
1563  size_t bytes_per_value = 0;
1565  // assume that ST is default constructible
1566  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1567  }
1568  else {
1569  // Since the packed data come from the source matrix, we can use the source
1570  // matrix to get the number of bytes per Scalar value stored in the matrix.
1571  // This assumes that all Scalar values in the source matrix require the same
1572  // number of bytes. If the source matrix has no entries on the calling
1573  // process, then we hope that some process does have some idea how big
1574  // a Scalar value is. Of course, if no processes have any entries, then no
1575  // values should be packed (though this does assume that in our packing
1576  // scheme, rows with zero entries take zero bytes).
1577  size_t bytes_per_value_l = 0;
1578  if (local_matrix.values.extent(0) > 0) {
1579  const ST& val = local_matrix.values(0);
1580  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1581  } else {
1582  const ST& val = crs_vals_d(0);
1583  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1584  }
1585  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1586  Teuchos::REDUCE_MAX,
1587  bytes_per_value_l,
1588  outArg(bytes_per_value));
1589  }
1590 
1591 # ifdef HAVE_TPETRA_MMM_TIMINGS
1592  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1593 # endif
1595  local_matrix, local_col_map, import_lids_d, imports_d,
1596  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1597  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1598  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1599  bytes_per_value);
1600 # ifdef HAVE_TPETRA_MMM_TIMINGS
1601  tm = Teuchos::null;
1602 # endif
1603 
1604  // Copy outputs back to host
1605 # ifdef HAVE_TPETRA_MMM_TIMINGS
1606  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1607 # endif
1608 
1609  Kokkos::parallel_for("setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (const size_t i) {
1610  if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1611  });
1612 
1613 } //unpackAndCombineIntoCrsArrays
1614 
1615 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1616 void
1619  const Kokkos::View<LocalOrdinal const *,
1620  Kokkos::Device<typename Node::device_type::execution_space,
1621  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1622 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1623  , void, void
1624 #endif
1625  > import_lids_d,
1626  const Kokkos::View<const char*,
1627  Kokkos::Device<typename Node::device_type::execution_space,
1628  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1629 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1630  , void, void
1631 #endif
1632  > imports_d,
1633  const Kokkos::View<const size_t*,
1634  Kokkos::Device<typename Node::device_type::execution_space,
1635  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1636 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1637  , void, void
1638 #endif
1639  > num_packets_per_lid_d,
1640  const size_t numSameIDs,
1641  const Kokkos::View<LocalOrdinal const *,
1642  Kokkos::Device<typename Node::device_type::execution_space,
1643  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1644 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1645  , void, void
1646 #endif
1647  > permute_to_lids_d,
1648  const Kokkos::View<LocalOrdinal const *,
1649  Kokkos::Device<typename Node::device_type::execution_space,
1650  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1651 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1652  , void, void
1653 #endif
1654  > permute_from_lids_d,
1655  size_t TargetNumRows,
1656  const int MyTargetPID,
1657  Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1658  Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1659  Teuchos::ArrayRCP<Scalar>& CRS_vals,
1660  const Teuchos::ArrayView<const int>& SourcePids,
1661  Teuchos::Array<int>& TargetPids)
1662 {
1663  using execution_space = typename Node::execution_space;
1665 
1666  using Kokkos::View;
1667  using Kokkos::deep_copy;
1668 
1669  using Teuchos::ArrayView;
1670  using Teuchos::outArg;
1671  using Teuchos::REDUCE_MAX;
1672  using Teuchos::reduceAll;
1673 
1674  typedef typename Node::device_type DT;
1675 
1677  typedef typename matrix_type::impl_scalar_type ST;
1678 
1679  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1680 # ifdef HAVE_TPETRA_MMM_TIMINGS
1681  using Teuchos::TimeMonitor;
1682  Teuchos::RCP<TimeMonitor> tm;
1683 # endif
1684 
1685  using Kokkos::MemoryUnmanaged;
1686 
1687  TEUCHOS_TEST_FOR_EXCEPTION
1688  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1689  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1690  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1691  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1692  // process, then the matrix is neither locally nor globally indexed.
1693  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1694  TEUCHOS_TEST_FOR_EXCEPTION
1695  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1696  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1697  TEUCHOS_TEST_FOR_EXCEPTION
1698  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1699  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1700  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1701 
1702  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1703 
1704  // TargetNumNonzeros is number of nonzeros in local matrix.
1705 # ifdef HAVE_TPETRA_MMM_TIMINGS
1706  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1707 # endif
1708  size_t TargetNumNonzeros =
1710  local_matrix, permute_from_lids_d, imports_d,
1711  num_packets_per_lid_d, numSameIDs);
1712 # ifdef HAVE_TPETRA_MMM_TIMINGS
1713  tm = Teuchos::null;
1714 # endif
1715 
1716 # ifdef HAVE_TPETRA_MMM_TIMINGS
1717  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1718 # endif
1719  CRS_rowptr.resize (TargetNumRows+1);
1720  CRS_colind.resize(TargetNumNonzeros);
1721  CRS_vals.resize(TargetNumNonzeros);
1722  Teuchos::ArrayRCP<ST> const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1723 # ifdef HAVE_TPETRA_MMM_TIMINGS
1724  tm = Teuchos::null;
1725 # endif
1726 
1727  TEUCHOS_TEST_FOR_EXCEPTION(
1728  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1729  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1730  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1731 
1732  // Preseed TargetPids with -1 for local
1733  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1734  TargetPids.resize (TargetNumNonzeros);
1735  }
1736  TargetPids.assign (TargetNumNonzeros, -1);
1737 
1738  // Grab pointers for sourceMatrix
1739  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1740 
1741 # ifdef HAVE_TPETRA_MMM_TIMINGS
1742  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1743 # endif
1744  // Convert input arrays to Kokkos::Views
1745  DT outputDevice;
1746 
1747  auto crs_rowptr_d =
1748  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1749  CRS_rowptr.size(), true, "crs_rowptr");
1750 
1751  auto crs_colind_d =
1752  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1753  CRS_colind.size(), true, "crs_colidx");
1754 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1755  static_assert (! std::is_same<
1756  typename std::remove_const<
1757  typename std::decay<
1758  decltype (CRS_vals_impl_scalar_type)
1759  >::type::value_type
1760  >::type,
1761  std::complex<double> >::value,
1762  "CRS_vals::value_type is std::complex<double>; this should never happen"
1763  ", since std::complex does not work in Kokkos::View objects.");
1764 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1765 
1766  auto crs_vals_d =
1767  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1768  CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1769 
1770 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1771  static_assert (! std::is_same<
1772  typename decltype (crs_vals_d)::non_const_value_type,
1773  std::complex<double> >::value,
1774  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1775  "never happen, since std::complex does not work in Kokkos::View objects.");
1776 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1777 
1778  auto src_pids_d =
1779  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1780  SourcePids.size(), true, "src_pids");
1781 
1782  auto tgt_pids_d =
1783  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1784  TargetPids.size(), true, "tgt_pids");
1785 
1786 # ifdef HAVE_TPETRA_MMM_TIMINGS
1787  tm = Teuchos::null;
1788 # endif
1789 
1790  size_t bytes_per_value = 0;
1792  // assume that ST is default constructible
1793  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1794  }
1795  else {
1796  // Since the packed data come from the source matrix, we can use the source
1797  // matrix to get the number of bytes per Scalar value stored in the matrix.
1798  // This assumes that all Scalar values in the source matrix require the same
1799  // number of bytes. If the source matrix has no entries on the calling
1800  // process, then we hope that some process does have some idea how big
1801  // a Scalar value is. Of course, if no processes have any entries, then no
1802  // values should be packed (though this does assume that in our packing
1803  // scheme, rows with zero entries take zero bytes).
1804  size_t bytes_per_value_l = 0;
1805  if (local_matrix.values.extent(0) > 0) {
1806  const ST& val = local_matrix.values(0);
1807  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1808  } else {
1809  const ST& val = crs_vals_d(0);
1810  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1811  }
1812  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1813  Teuchos::REDUCE_MAX,
1814  bytes_per_value_l,
1815  outArg(bytes_per_value));
1816  }
1817 
1818 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1819  static_assert (! std::is_same<
1820  typename decltype (crs_vals_d)::non_const_value_type,
1821  std::complex<double> >::value,
1822  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1823  "never happen, since std::complex does not work in Kokkos::View objects.");
1824 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1825 
1826 # ifdef HAVE_TPETRA_MMM_TIMINGS
1827  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1828 # endif
1830  local_matrix, local_col_map, import_lids_d, imports_d,
1831  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1832  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1833  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1834  bytes_per_value);
1835 # ifdef HAVE_TPETRA_MMM_TIMINGS
1836  tm = Teuchos::null;
1837 # endif
1838 
1839  // Copy outputs back to host
1840 # ifdef HAVE_TPETRA_MMM_TIMINGS
1841  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1842 # endif
1843  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1844  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1845  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1846  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1847 
1848  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1849  CRS_colind.getRawPtr(), CRS_colind.size());
1850  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1851  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1852 
1853  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1854  CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1855  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1856  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1857 
1858  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1859  TargetPids.getRawPtr(), TargetPids.size());
1860  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1861  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1862 
1863 } //unpackAndCombineIntoCrsArrays
1864 
1865 
1866 } // namespace Details
1867 } // namespace Tpetra
1868 
1869 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT ) \
1870  template void \
1871  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1872  const CrsMatrix<ST, LO, GO, NT>&, \
1873  const Teuchos::ArrayView<const char>&, \
1874  const Teuchos::ArrayView<const size_t>&, \
1875  const Teuchos::ArrayView<const LO>&, \
1876  size_t, \
1877  CombineMode); \
1878  template size_t \
1879  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1880  const CrsMatrix<ST, LO, GO, NT> &, \
1881  const Teuchos::ArrayView<const LO> &, \
1882  const Teuchos::ArrayView<const char> &, \
1883  const Teuchos::ArrayView<const size_t>&, \
1884  size_t, \
1885  CombineMode, \
1886  size_t, \
1887  const Teuchos::ArrayView<const LO>&, \
1888  const Teuchos::ArrayView<const LO>&); \
1889  template void \
1890  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1891  const CrsMatrix<ST, LO, GO, NT>&, \
1892  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1893  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1894  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1895  const size_t, \
1896  const CombineMode); \
1897  template void \
1898  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1899  const CrsMatrix<ST, LO, GO, NT> &, \
1900  const Kokkos::View<LO const *, \
1901  Kokkos::Device<typename NT::device_type::execution_space, \
1902  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1903  void, void >, \
1904  const Kokkos::View<const char*, \
1905  Kokkos::Device<typename NT::device_type::execution_space, \
1906  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1907  void, void >, \
1908  const Kokkos::View<const size_t*, \
1909  Kokkos::Device<typename NT::device_type::execution_space, \
1910  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1911  void, void >, \
1912  const size_t, \
1913  const Kokkos::View<LO const *, \
1914  Kokkos::Device<typename NT::device_type::execution_space, \
1915  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1916  void, void >, \
1917  const Kokkos::View<LO const *, \
1918  Kokkos::Device<typename NT::device_type::execution_space, \
1919  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1920  void, void >, \
1921  size_t, \
1922  const int, \
1923  Kokkos::View<size_t*,typename NT::device_type>&, \
1924  Kokkos::View<GO*,typename NT::device_type>&, \
1925  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1926  const Teuchos::ArrayView<const int>&, \
1927  Kokkos::View<int*,typename NT::device_type>&); \
1928  template void \
1929  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1930  const CrsMatrix<ST, LO, GO, NT> &, \
1931  const Kokkos::View<LO const *, \
1932  Kokkos::Device<typename NT::device_type::execution_space, \
1933  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1934  void, void >, \
1935  const Kokkos::View<const char*, \
1936  Kokkos::Device<typename NT::device_type::execution_space, \
1937  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1938  void, void >, \
1939  const Kokkos::View<const size_t*, \
1940  Kokkos::Device<typename NT::device_type::execution_space, \
1941  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1942  void, void >, \
1943  const size_t, \
1944  const Kokkos::View<LO const *, \
1945  Kokkos::Device<typename NT::device_type::execution_space, \
1946  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1947  void, void >, \
1948  const Kokkos::View<LO const *, \
1949  Kokkos::Device<typename NT::device_type::execution_space, \
1950  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1951  void, void >, \
1952  size_t, \
1953  const int, \
1954  Teuchos::ArrayRCP<size_t>&, \
1955  Teuchos::ArrayRCP<GO>&, \
1956  Teuchos::ArrayRCP<ST>&, \
1957  const Teuchos::ArrayView<const int>&, \
1958  Teuchos::Array<int>&);
1959 
1960 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT ) \
1961  template void \
1962  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1963  const CrsMatrix<ST, LO, GO, NT>&, \
1964  const Teuchos::ArrayView<const char>&, \
1965  const Teuchos::ArrayView<const size_t>&, \
1966  const Teuchos::ArrayView<const LO>&, \
1967  size_t, \
1968  CombineMode); \
1969  template size_t \
1970  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1971  const CrsMatrix<ST, LO, GO, NT> &, \
1972  const Teuchos::ArrayView<const LO> &, \
1973  const Teuchos::ArrayView<const char> &, \
1974  const Teuchos::ArrayView<const size_t>&, \
1975  size_t, \
1976  CombineMode, \
1977  size_t, \
1978  const Teuchos::ArrayView<const LO>&, \
1979  const Teuchos::ArrayView<const LO>&); \
1980  template void \
1981  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1982  const CrsMatrix<ST, LO, GO, NT>&, \
1983  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1984  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1985  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1986  const size_t, \
1987  const CombineMode); \
1988  template void \
1989  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1990  const CrsMatrix<ST, LO, GO, NT> &, \
1991  const Kokkos::View<LO const *, \
1992  Kokkos::Device<typename NT::device_type::execution_space, \
1993  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1994  const Kokkos::View<const char*, \
1995  Kokkos::Device<typename NT::device_type::execution_space, \
1996  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1997  const Kokkos::View<const size_t*, \
1998  Kokkos::Device<typename NT::device_type::execution_space, \
1999  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2000  const size_t, \
2001  const Kokkos::View<LO const *, \
2002  Kokkos::Device<typename NT::device_type::execution_space, \
2003  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2004  const Kokkos::View<LO const *, \
2005  Kokkos::Device<typename NT::device_type::execution_space, \
2006  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2007  size_t, \
2008  const int, \
2009  Kokkos::View<size_t*,typename NT::device_type>&, \
2010  Kokkos::View<GO*,typename NT::device_type>&, \
2011  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
2012  const Teuchos::ArrayView<const int>&, \
2013  Kokkos::View<int*,typename NT::device_type>&); \
2014  template void \
2015  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
2016  const CrsMatrix<ST, LO, GO, NT> &, \
2017  const Kokkos::View<LO const *, \
2018  Kokkos::Device<typename NT::device_type::execution_space, \
2019  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2020  const Kokkos::View<const char*, \
2021  Kokkos::Device<typename NT::device_type::execution_space, \
2022  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2023  const Kokkos::View<const size_t*, \
2024  Kokkos::Device<typename NT::device_type::execution_space, \
2025  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2026  const size_t, \
2027  const Kokkos::View<LO const *, \
2028  Kokkos::Device<typename NT::device_type::execution_space, \
2029  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2030  const Kokkos::View<LO const *, \
2031  Kokkos::Device<typename NT::device_type::execution_space, \
2032  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2033  size_t, \
2034  const int, \
2035  Teuchos::ArrayRCP<size_t>&, \
2036  Teuchos::ArrayRCP<GO>&, \
2037  Teuchos::ArrayRCP<ST>&, \
2038  const Teuchos::ArrayView<const int>&, \
2039  Teuchos::Array<int>&);
2040 
2041 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
2042 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2043  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT )
2044 #else
2045 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2046  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT )
2047 #endif
2048 
2049 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
&quot;Local&quot; part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...