Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12 
13 #include <memory>
14 #include <string>
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
30 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace UnpackAndCombineCrsMatrixImpl {
60 
70 template<class ST, class LO, class GO>
71 KOKKOS_FUNCTION int
72 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
73  const typename PackTraits<int>::output_array_type& pids_out,
74  const typename PackTraits<ST>::output_array_type& vals_out,
75  const char imports[],
76  const size_t offset,
77  const size_t /* num_bytes */,
78  const size_t num_ent,
79  const size_t bytes_per_value)
80 {
81  if (num_ent == 0) {
82  // Empty rows always take zero bytes, to ensure sparsity.
83  return 0;
84  }
85  bool unpack_pids = pids_out.size() > 0;
86 
87  const size_t num_ent_beg = offset;
88  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
89 
90  const size_t gids_beg = num_ent_beg + num_ent_len;
91  const size_t gids_len =
92  num_ent * PackTraits<GO>::packValueCount (GO (0));
93 
94  const size_t pids_beg = gids_beg + gids_len;
95  const size_t pids_len = unpack_pids ?
96  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
97  size_t (0);
98 
99  const size_t vals_beg = gids_beg + gids_len + pids_len;
100  const size_t vals_len = num_ent * bytes_per_value;
101 
102  const char* const num_ent_in = imports + num_ent_beg;
103  const char* const gids_in = imports + gids_beg;
104  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
105  const char* const vals_in = imports + vals_beg;
106 
107  size_t num_bytes_out = 0;
108  LO num_ent_out;
109  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
110  if (static_cast<size_t> (num_ent_out) != num_ent) {
111  return 20; // error code
112  }
113 
114  {
115  Kokkos::pair<int, size_t> p;
116  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
117  if (p.first != 0) {
118  return 21; // error code
119  }
120  num_bytes_out += p.second;
121 
122  if (unpack_pids) {
123  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
124  if (p.first != 0) {
125  return 22; // error code
126  }
127  num_bytes_out += p.second;
128  }
129 
130  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
131  if (p.first != 0) {
132  return 23; // error code
133  }
134  num_bytes_out += p.second;
135  }
136 
137  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138  if (num_bytes_out != expected_num_bytes) {
139  return 24; // error code
140  }
141  return 0; // no errors
142 } //unpackRow
143 
154 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
156  typedef LocalMatrix local_matrix_type;
157  typedef LocalMap local_map_type;
158 
159  typedef typename local_matrix_type::value_type ST;
160  typedef typename local_map_type::local_ordinal_type LO;
161  typedef typename local_map_type::global_ordinal_type GO;
162  typedef typename local_map_type::device_type DT;
163  typedef typename DT::execution_space XS;
164 
165  typedef Kokkos::View<const size_t*, BufferDeviceType>
166  num_packets_per_lid_type;
167  typedef Kokkos::View<const size_t*, DT> offsets_type;
168  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
170 
171  typedef Kokkos::View<int, DT> error_type;
172  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
173 
174  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175  "LocalMap::local_ordinal_type and "
176  "LocalMatrix::ordinal_type must be the same.");
177 
178  local_matrix_type local_matrix;
179  local_map_type local_col_map;
180  input_buffer_type imports;
181  num_packets_per_lid_type num_packets_per_lid;
182  import_lids_type import_lids;
183  Kokkos::View<const LO*[2], DT> batch_info;
184  offsets_type offsets;
185  Tpetra::CombineMode combine_mode;
186  size_t batch_size;
187  size_t bytes_per_value;
188  bool atomic;
189  error_type error_code;
190 
192  const local_matrix_type& local_matrix_in,
193  const local_map_type& local_col_map_in,
194  const input_buffer_type& imports_in,
195  const num_packets_per_lid_type& num_packets_per_lid_in,
196  const import_lids_type& import_lids_in,
197  const Kokkos::View<const LO*[2], DT>& batch_info_in,
198  const offsets_type& offsets_in,
199  const Tpetra::CombineMode combine_mode_in,
200  const size_t batch_size_in,
201  const size_t bytes_per_value_in,
202  const bool atomic_in) :
203  local_matrix (local_matrix_in),
204  local_col_map (local_col_map_in),
205  imports (imports_in),
206  num_packets_per_lid (num_packets_per_lid_in),
207  import_lids (import_lids_in),
208  batch_info (batch_info_in),
209  offsets (offsets_in),
210  combine_mode (combine_mode_in),
211  batch_size (batch_size_in),
212  bytes_per_value (bytes_per_value_in),
213  atomic (atomic_in),
214  error_code("error")
215  {}
216 
217  KOKKOS_INLINE_FUNCTION
218  void operator()(member_type team_member) const
219  {
220  using Kokkos::View;
221  using Kokkos::subview;
222  using Kokkos::MemoryUnmanaged;
223 
224  const LO batch = team_member.league_rank();
225  const LO lid_no = batch_info(batch, 0);
226  const LO batch_no = batch_info(batch, 1);
227 
228  const size_t num_bytes = num_packets_per_lid(lid_no);
229 
230  // Only unpack data if there is a nonzero number of bytes.
231  if (num_bytes == 0)
232  return;
233 
234  // there is actually something in the row
235  const LO import_lid = import_lids(lid_no);
236  const size_t buf_size = imports.size();
237  const size_t offset = offsets(lid_no);
238 
239  // Get the number of entries to expect in the received data for this row.
240  LO num_ent_LO = 0;
241  const char* const in_buf = imports.data() + offset;
242  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
243  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
244 
245  // Count the number of bytes expected to unpack
246  size_t expected_num_bytes = 0;
247  {
248  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
249  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
250  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
251  }
252 
253  if (expected_num_bytes > num_bytes)
254  {
255  Kokkos::printf(
256  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
257  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
258  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
259  );
260 
261  Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
262  return;
263  }
264 
265  if (offset > buf_size || offset + num_bytes > buf_size)
266  {
267  Kokkos::printf(
268  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
269  "At row %d, the offset (%d) > buffer size (%d)\n",
270  (int) lid_no, (int) offset, (int) buf_size
271  );
272 
273  Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
274  return;
275  }
276 
277  // Determine the number of entries to unpack in this batch
278  size_t num_entries_in_batch = 0;
279  if (num_entries_in_row <= batch_size)
280  num_entries_in_batch = num_entries_in_row;
281  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
282  num_entries_in_batch = batch_size;
283  else
284  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
285 
286  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
287  const size_t num_ent_start = offset;
288  const size_t num_ent_end = num_ent_start + bytes_per_lid;
289 
290  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
291  const size_t gids_start = num_ent_end;
292  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
293 
294  const size_t vals_start = gids_end;
295 
296  const size_t shift = batch_no * batch_size;
297  const char* const num_ent_in = imports.data() + num_ent_start;
298  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
299  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
300 
301  LO num_ent_out;
302  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
303  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
304  {
305  Kokkos::printf(
306  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
307  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
308  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
309  );
310 
311  Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
312  }
313 
314  constexpr bool matrix_has_sorted_rows = true; // see #6282
315  //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
316  //By ref triggers compiler bug in CUDA 10.
317  Kokkos::parallel_for(
318  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
319  [=, *this](const LO& j)
320  {
321  size_t distance = 0;
322 
323  GO gid_out;
324  distance = j * bytes_per_gid;
325  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
326  auto lid_out = local_col_map.getLocalElement(gid_out);
327 
328  // Column indices come in as global indices, in case the
329  // source object's column Map differs from the target object's
330  // (this's) column Map, and must be converted local index values
331 
332  // assume that ST is default constructible
333  ST val_out;
334  distance = j * bytes_per_value;
335  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
336 
337  if (combine_mode == ADD) {
338  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
339  // different threads don't touch the same row (i.e., no
340  // duplicates in incoming LIDs list).
341  const bool use_atomic_updates = atomic;
342  (void)local_matrix.sumIntoValues(
343  import_lid,
344  &lid_out,
345  1,
346  &val_out,
347  matrix_has_sorted_rows,
348  use_atomic_updates
349  );
350  } else if (combine_mode == REPLACE) {
351  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
352  // combine mode with multiple incoming rows that touch the same
353  // target matrix entries, so we never need atomic updates.
354  const bool use_atomic_updates = false;
355  (void)local_matrix.replaceValues(
356  import_lid,
357  &lid_out,
358  1,
359  &val_out,
360  matrix_has_sorted_rows,
361  use_atomic_updates
362  );
363  } else {
364  // should never get here
365  Kokkos::printf(
366  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
367  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
368  );
369  Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
370  }
371  }
372  );
373 
374  team_member.team_barrier();
375 
376  }
377 
379  int error() const {
380  auto error_code_h = Kokkos::create_mirror_view_and_copy(
381  Kokkos::HostSpace(), error_code
382  );
383  return error_code_h();
384  }
385 
386 }; //UnpackCrsMatrixAndCombineFunctor
387 
388 struct MaxNumEntTag {};
389 struct TotNumEntTag {};
390 
399 template<class LO, class DT, class BDT>
401 public:
402  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
403  typedef Kokkos::View<const size_t*, DT> offsets_type;
404  typedef Kokkos::View<const char*, BDT> input_buffer_type;
405  // This needs to be public, since it appears in the argument list of
406  // public methods (see below). Otherwise, build errors may happen.
407  typedef size_t value_type;
408 
409 private:
410  num_packets_per_lid_type num_packets_per_lid;
411  offsets_type offsets;
412  input_buffer_type imports;
413 
414 public:
415  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
416  const offsets_type& offsets_in,
417  const input_buffer_type& imports_in) :
418  num_packets_per_lid (num_packets_per_lid_in),
419  offsets (offsets_in),
420  imports (imports_in)
421  {}
422 
423  KOKKOS_INLINE_FUNCTION void
424  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
425  // Get how many entries to expect in the received data for this row.
426  const size_t num_bytes = num_packets_per_lid(i);
427  if (num_bytes > 0) {
428  LO num_ent_LO = 0; // output argument of unpackValue
429  const char* const in_buf = imports.data () + offsets(i);
430  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
431  const size_t num_ent = static_cast<size_t> (num_ent_LO);
432 
433  update = (update < num_ent) ? num_ent : update;
434  }
435  }
436 
437  KOKKOS_INLINE_FUNCTION void
438  join (const MaxNumEntTag,
439  value_type& dst,
440  const value_type& src) const
441  {
442  if (dst < src) dst = src;
443  }
444 
445  KOKKOS_INLINE_FUNCTION void
446  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
447  // Get how many entries to expect in the received data for this row.
448  const size_t num_bytes = num_packets_per_lid(i);
449  if (num_bytes > 0) {
450  LO num_ent_LO = 0; // output argument of unpackValue
451  const char* const in_buf = imports.data () + offsets(i);
452  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
453  tot_num_ent += static_cast<size_t> (num_ent_LO);
454  }
455  }
456 }; //NumEntriesFunctor
457 
465 template<class LO, class DT, class BDT>
466 size_t
467 compute_maximum_num_entries (
468  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
469  const Kokkos::View<const size_t*, DT>& offsets,
470  const Kokkos::View<const char*, BDT>& imports)
471 {
472  typedef typename DT::execution_space XS;
473  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
474  MaxNumEntTag> range_policy;
475 
476  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
477  imports);
478  const LO numRowsToUnpack =
479  static_cast<LO> (num_packets_per_lid.extent (0));
480  size_t max_num_ent = 0;
481  Kokkos::parallel_reduce ("Max num entries in CRS",
482  range_policy (0, numRowsToUnpack),
483  functor, max_num_ent);
484  return max_num_ent;
485 }
486 
494 template<class LO, class DT, class BDT>
495 size_t
496 compute_total_num_entries (
497  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
498  const Kokkos::View<const size_t*, DT>& offsets,
499  const Kokkos::View<const char*, BDT>& imports)
500 {
501  typedef typename DT::execution_space XS;
502  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
503  size_t tot_num_ent = 0;
504  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
505  imports);
506  const LO numRowsToUnpack =
507  static_cast<LO> (num_packets_per_lid.extent (0));
508  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
509  range_policy (0, numRowsToUnpack),
510  functor, tot_num_ent);
511  return tot_num_ent;
512 }
513 
514 template<class LO>
515 KOKKOS_INLINE_FUNCTION
516 size_t
517 unpackRowCount(const char imports[],
518  const size_t offset,
519  const size_t num_bytes)
520 {
521  using PT = PackTraits<LO>;
522 
523  LO num_ent_LO = 0;
524  if (num_bytes > 0) {
525  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
526  if (p_num_bytes > num_bytes) {
527  return OrdinalTraits<size_t>::invalid();
528  }
529  const char* const in_buf = imports + offset;
530  (void) PT::unpackValue(num_ent_LO, in_buf);
531  }
532  return static_cast<size_t>(num_ent_LO);
533 }
534 
539 template<class View1, class View2>
540 inline
541 bool
542 compute_batch_info(
543  const View1& batches_per_lid,
544  View2& batch_info
545 )
546 {
547  using LO = typename View2::value_type;
548  size_t batch = 0;
549  for (size_t i=0; i<batches_per_lid.extent(0); i++)
550  {
551  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
552  {
553  batch_info(batch, 0) = static_cast<LO>(i);
554  batch_info(batch, 1) = batch_no;
555  batch++;
556  }
557  }
558  return batch == batch_info.extent(0);
559 }
560 
568 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
569 void
570 unpackAndCombineIntoCrsMatrix(
571  const LocalMatrix& local_matrix,
572  const LocalMap& local_map,
573  const Kokkos::View<const char*, BufferDeviceType>& imports,
574  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
575  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
576  const Tpetra::CombineMode combine_mode)
577 {
578  using ST = typename LocalMatrix::value_type;
579  using LO = typename LocalMap::local_ordinal_type;
580  using DT = typename LocalMap::device_type;
581  using XS = typename DT::execution_space;
582  const char prefix[] =
583  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
584  "unpackAndCombineIntoCrsMatrix: ";
585 
586  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
587  if (num_import_lids == 0) {
588  // Nothing to unpack
589  return;
590  }
591 
592  {
593  // Check for correct input
594  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
595  std::invalid_argument,
596  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
597  "static graph (i.e., was constructed with the CrsMatrix constructor "
598  "that takes a const CrsGraph pointer).");
599 
600  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
601  std::invalid_argument,
602  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
603  "(i.e., was constructed with the CrsMatrix constructor that takes a "
604  "const CrsGraph pointer).");
605 
606  // Unknown combine mode!
607  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
608  std::invalid_argument,
609  prefix << "Invalid combine mode; should never get "
610  "here! Please report this bug to the Tpetra developers.");
611 
612  // Check that sizes of input objects are consistent.
613  bool bad_num_import_lids =
614  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
615  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
616  std::invalid_argument,
617  prefix << "importLIDs.size() (" << num_import_lids << ") != "
618  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
619  } // end QA error checking
620 
621  // Get the offsets
622  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
623  computeOffsetsFromCounts(offsets, num_packets_per_lid);
624 
625  // Determine the sizes of the unpack batches
626  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
627  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
628  const size_t batch_size = std::min(default_batch_size, max_num_ent);
629 
630  // To achieve some balance amongst threads, unpack each row in equal size batches
631  size_t num_batches = 0;
632  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
633  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
634  // Compute meta data that allows batch unpacking
635  Kokkos::parallel_reduce(
636  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
637  KOKKOS_LAMBDA(const size_t i, size_t& batches)
638  {
639  const size_t num_entries_in_row = unpackRowCount<LO>(
640  imports.data(), offsets(i), num_packets_per_lid(i)
641  );
642  batches_per_lid(i) =
643  (num_entries_in_row <= batch_size) ?
644  1 :
645  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
646  batches += batches_per_lid(i);
647  },
648  num_batches
649  );
650  Kokkos::resize(batch_info, num_batches);
651 
652  Kokkos::HostSpace host_space;
653  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
654  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
655  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
656 
657  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
658 
659  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
660  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
661  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
662 
663  // FIXME (TJF SEP 2017)
664  // The scalar type is not necessarily default constructible
665  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
666 
667  // Now do the actual unpack!
668  const bool atomic = XS().concurrency() != 1;
669  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
670  functor f(
671  local_matrix,
672  local_map,
673  imports,
674  num_packets_per_lid,
675  import_lids,
676  batch_info,
677  offsets,
678  combine_mode,
679  batch_size,
680  bytes_per_value,
681  atomic
682  );
683 
684  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
686  if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
687  {
688  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
689  }
690  else
691  {
692  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
693  }
694 
695  auto error_code = f.error();
696  TEUCHOS_TEST_FOR_EXCEPTION(
697  error_code != 0,
698  std::runtime_error,
699  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
700  );
701 } //unpackAndCombineIntoCrsMatrix (Kokkos version)
702 
703 template<class LocalMatrix, class BufferDeviceType>
704 size_t
706  const LocalMatrix& local_matrix,
707  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
708 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
709  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
710  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
711 #else
712  const Kokkos::View<const char*, BufferDeviceType>& imports,
713  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
714 #endif
715  const size_t num_same_ids)
716 {
717  using Kokkos::parallel_reduce;
718  typedef typename LocalMatrix::ordinal_type LO;
719  typedef typename LocalMatrix::device_type device_type;
720  typedef typename device_type::execution_space XS;
721  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
722  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
723  typedef BufferDeviceType BDT;
724 
725  size_t count = 0;
726  LO num_items;
727 
728  // Number of matrix entries to unpack (returned by this function).
729  num_items = static_cast<LO>(num_same_ids);
730  if (num_items) {
731  size_t kcnt = 0;
732  parallel_reduce(range_policy(0, num_items),
733  KOKKOS_LAMBDA(const LO lid, size_t& update) {
734  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
735  -local_matrix.graph.row_map[lid]);
736  }, kcnt);
737  count += kcnt;
738  }
739 
740  // Count entries copied directly from the source matrix with permuting.
741  num_items = static_cast<LO>(permute_from_lids.extent(0));
742  if (num_items) {
743  size_t kcnt = 0;
744  parallel_reduce(range_policy(0, num_items),
745  KOKKOS_LAMBDA(const LO i, size_t& update) {
746  const LO lid = permute_from_lids(i);
747  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
748  - local_matrix.graph.row_map[lid]);
749  }, kcnt);
750  count += kcnt;
751  }
752 
753  {
754  // Count entries received from other MPI processes.
755  const size_type np = num_packets_per_lid.extent(0);
756  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
757  computeOffsetsFromCounts(offsets, num_packets_per_lid);
758  count +=
759  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
760  offsets, imports);
761  }
762 
763  return count;
764 } //unpackAndCombineWithOwningPIDsCount (Kokkos version)
765 
767 template<class LO, class DT, class BDT>
768 int
769 setupRowPointersForRemotes(
770  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
771  const typename PackTraits<LO>::input_array_type& import_lids,
772  const Kokkos::View<const char*, BDT>& imports,
773  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
774  const typename PackTraits<size_t>::input_array_type& offsets)
775 {
776  using Kokkos::parallel_reduce;
777  typedef typename DT::execution_space XS;
778  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
779  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
780 
781  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
782  const size_type N = num_packets_per_lid.extent(0);
783 
784  int errors = 0;
785  parallel_reduce ("Setup row pointers for remotes",
786  range_policy (0, N),
787  KOKKOS_LAMBDA (const size_t i, int& k_error) {
788  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
789  const size_t num_bytes = num_packets_per_lid(i);
790  const size_t offset = offsets(i);
791  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
792  if (num_ent == InvalidNum) {
793  k_error += 1;
794  }
795  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
796  }, errors);
797  return errors;
798 }
799 
800 // Convert array of row lengths to a CRS pointer array
801 template<class DT>
802 void
803 makeCrsRowPtrFromLengths(
804  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
805  const Kokkos::View<size_t*,DT>& new_start_row)
806 {
807  using Kokkos::parallel_scan;
808  typedef typename DT::execution_space XS;
809  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
810  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
811  const size_type N = new_start_row.extent(0);
812  parallel_scan(range_policy(0, N),
813  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
814  auto cur_val = tgt_rowptr(i);
815  if (final) {
816  tgt_rowptr(i) = update;
817  new_start_row(i) = tgt_rowptr(i);
818  }
819  update += cur_val;
820  }
821  );
822 }
823 
824 template<class LocalMatrix, class LocalMap>
825 void
826 copyDataFromSameIDs(
827  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
828  const typename PackTraits<int>::output_array_type& tgt_pids,
829  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
830  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
831  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
832  const typename PackTraits<int>::input_array_type& src_pids,
833  const LocalMatrix& local_matrix,
834  const LocalMap& local_col_map,
835  const size_t num_same_ids,
836  const int my_pid)
837 {
838  using Kokkos::parallel_for;
839  typedef typename LocalMap::device_type DT;
840  typedef typename LocalMap::local_ordinal_type LO;
841  typedef typename DT::execution_space XS;
842  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
843 
844  parallel_for(range_policy(0, num_same_ids),
845  KOKKOS_LAMBDA(const size_t i) {
846  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
847 
848  const LO src_lid = static_cast<LO>(i);
849  size_t src_row = local_matrix.graph.row_map(src_lid);
850 
851  const LO tgt_lid = static_cast<LO>(i);
852  const size_t tgt_row = tgt_rowptr(tgt_lid);
853 
854  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
855  - local_matrix.graph.row_map(src_lid);
856  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
857 
858  for (size_t j=local_matrix.graph.row_map(src_lid);
859  j<local_matrix.graph.row_map(src_lid+1); ++j) {
860  LO src_col = local_matrix.graph.entries(j);
861  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
862  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
863  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
864  }
865  }
866  );
867 }
868 
869 template<class LocalMatrix, class LocalMap>
870 void
871 copyDataFromPermuteIDs(
872  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
873  const typename PackTraits<int>::output_array_type& tgt_pids,
874  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
875  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
876  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
877  const typename PackTraits<int>::input_array_type& src_pids,
878  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
879  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
880  const LocalMatrix& local_matrix,
881  const LocalMap& local_col_map,
882  const int my_pid)
883 {
884  using Kokkos::parallel_for;
885  typedef typename LocalMap::device_type DT;
886  typedef typename LocalMap::local_ordinal_type LO;
887  typedef typename DT::execution_space XS;
888  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
889  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
890 
891  const size_type num_permute_to_lids = permute_to_lids.extent(0);
892 
893  parallel_for(range_policy(0, num_permute_to_lids),
894  KOKKOS_LAMBDA(const size_t i) {
895  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
896 
897  const LO src_lid = permute_from_lids(i);
898  const size_t src_row = local_matrix.graph.row_map(src_lid);
899 
900  const LO tgt_lid = permute_to_lids(i);
901  const size_t tgt_row = tgt_rowptr(tgt_lid);
902 
903  size_t nsr = local_matrix.graph.row_map(src_lid+1)
904  - local_matrix.graph.row_map(src_lid);
905  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
906 
907  for (size_t j=local_matrix.graph.row_map(src_lid);
908  j<local_matrix.graph.row_map(src_lid+1); ++j) {
909  LO src_col = local_matrix.graph.entries(j);
910  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
911  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
912  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
913  }
914  }
915  );
916 }
917 
918 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
919 int
920 unpackAndCombineIntoCrsArrays2(
921  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
922  const typename PackTraits<int>::output_array_type& tgt_pids,
923  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
924  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
925  const typename PackTraits<size_t>::input_array_type& offsets,
926  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
927 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
928  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
929  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
930 #else
931  const Kokkos::View<const char*, BufferDeviceType>& imports,
932  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
933 #endif
934  const LocalMatrix& /* local_matrix */,
935  const LocalMap /*& local_col_map*/,
936  const int my_pid,
937  const size_t bytes_per_value)
938 {
939  using Kokkos::View;
940  using Kokkos::subview;
941  using Kokkos::MemoryUnmanaged;
942  using Kokkos::parallel_reduce;
943  using Kokkos::atomic_fetch_add;
945  typedef typename LocalMap::device_type DT;
946  typedef typename LocalMap::local_ordinal_type LO;
947  typedef typename LocalMap::global_ordinal_type GO;
948  typedef typename LocalMatrix::value_type ST;
949  typedef typename DT::execution_space XS;
950  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
951  typedef typename Kokkos::pair<size_type, size_type> slice;
952  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
953 
954  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
955  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
956  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
957 
958  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
959 
960  int errors = 0;
961  const size_type num_import_lids = import_lids.size();
962 
963  // RemoteIDs: Loop structure following UnpackAndCombine
964  parallel_reduce ("Unpack and combine into CRS",
965  range_policy (0, num_import_lids),
966  KOKKOS_LAMBDA (const size_t i, int& k_error) {
967  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
968  const size_t num_bytes = num_packets_per_lid(i);
969  const size_t offset = offsets(i);
970  if (num_bytes == 0) {
971  // Empty buffer means that the row is empty.
972  return;
973  }
974  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
975  if (num_ent == InvalidNum) {
976  k_error += 1;
977  return;
978  }
979  const LO lcl_row = import_lids(i);
980  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
981  const size_t end_row = start_row + num_ent;
982 
983  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
984  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
985  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
986 
987  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
988  imports.data(), offset, num_bytes,
989  num_ent, bytes_per_value);
990 
991  // Correct target PIDs.
992  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
993  const int pid = pids_out(j);
994  pids_out(j) = (pid != my_pid) ? pid : -1;
995  }
996  }, errors);
997 
998  return errors;
999 }
1000 
1001 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1002 void
1004  const LocalMatrix & local_matrix,
1005  const LocalMap & local_col_map,
1006  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1007 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1008  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1009  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1010 #else
1011  const Kokkos::View<const char*, BufferDeviceType>& imports,
1012  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1013 #endif
1014  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1015  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1016  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1017  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1018  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1019  const typename PackTraits<int>::input_array_type& src_pids,
1020  const typename PackTraits<int>::output_array_type& tgt_pids,
1021  const size_t num_same_ids,
1022  const size_t tgt_num_rows,
1023  const size_t tgt_num_nonzeros,
1024  const int my_tgt_pid,
1025  const size_t bytes_per_value)
1026 {
1027  using Kokkos::View;
1028  using Kokkos::subview;
1029  using Kokkos::parallel_for;
1030  using Kokkos::MemoryUnmanaged;
1032  typedef typename LocalMap::device_type DT;
1033  typedef typename LocalMap::local_ordinal_type LO;
1034  typedef typename DT::execution_space XS;
1035  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1036  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1037  typedef BufferDeviceType BDT;
1038 
1039  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1040 
1041  const size_t N = tgt_num_rows;
1042 
1043  // In the case of reduced communicators, the sourceMatrix won't have
1044  // the right "my_pid", so thus we have to supply it.
1045  const int my_pid = my_tgt_pid;
1046 
1047  // Zero the rowptr
1048  parallel_for(range_policy(0, N+1),
1049  KOKKOS_LAMBDA(const size_t i) {
1050  tgt_rowptr(i) = 0;
1051  }
1052  );
1053 
1054  // same IDs: Always first, always in the same place
1055  parallel_for(range_policy(0, num_same_ids),
1056  KOKKOS_LAMBDA(const size_t i) {
1057  const LO tgt_lid = static_cast<LO>(i);
1058  const LO src_lid = static_cast<LO>(i);
1059  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1060  - local_matrix.graph.row_map(src_lid);
1061  }
1062  );
1063 
1064  // Permute IDs: Still local, but reordered
1065  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1066  parallel_for(range_policy(0, num_permute_to_lids),
1067  KOKKOS_LAMBDA(const size_t i) {
1068  const LO tgt_lid = permute_to_lids(i);
1069  const LO src_lid = permute_from_lids(i);
1070  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1071  - local_matrix.graph.row_map(src_lid);
1072  }
1073  );
1074 
1075  // Get the offsets from the number of packets per LID
1076  const size_type num_import_lids = import_lids.extent(0);
1077  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1078  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1079 
1080 #ifdef HAVE_TPETRA_DEBUG
1081  {
1082  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1083  const bool condition =
1084  nth_offset_h != static_cast<size_t>(imports.extent (0));
1085  TEUCHOS_TEST_FOR_EXCEPTION
1086  (condition, std::logic_error, prefix
1087  << "The final offset in bytes " << nth_offset_h
1088  << " != imports.size() = " << imports.extent(0)
1089  << ". Please report this bug to the Tpetra developers.");
1090  }
1091 #endif // HAVE_TPETRA_DEBUG
1092 
1093  // Setup row pointers for remotes
1094  int k_error =
1095  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1096  import_lids, imports, num_packets_per_lid, offsets);
1097  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1098  << " Error transferring data to target row pointers. "
1099  "Please report this bug to the Tpetra developers.");
1100 
1101  // If multiple processes contribute to the same row, we may need to
1102  // update row offsets. This tracks that.
1103  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1104 
1105  // Turn row length into a real CRS row pointer
1106  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1107 
1108  // SameIDs: Copy the data over
1109  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1110  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1111 
1112  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1113  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1114  local_matrix, local_col_map, my_pid);
1115 
1116  if (imports.extent(0) <= 0) {
1117  return;
1118  }
1119 
1120  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1121  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1122  local_matrix, local_col_map, my_pid, bytes_per_value);
1123  TEUCHOS_TEST_FOR_EXCEPTION(
1124  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1125  "should never happen. Please report this bug to the Tpetra developers.");
1126 
1127  return;
1128 }
1129 
1130 } // namespace UnpackAndCombineCrsMatrixImpl
1131 
1171 template<typename ST, typename LO, typename GO, typename Node>
1172 void
1174  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1175  const Teuchos::ArrayView<const char>& imports,
1176  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1177  const Teuchos::ArrayView<const LO>& importLIDs,
1178  size_t /* constantNumPackets */,
1179  CombineMode combineMode)
1180 {
1181  using Kokkos::View;
1182  typedef typename Node::device_type device_type;
1183  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1184  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1185  "Node::device_type and LocalMatrix::device_type must be the same.");
1186 
1187  // Convert all Teuchos::Array to Kokkos::View.
1188  device_type outputDevice;
1189 
1190  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1191  // them to device. Since unpacking is done directly in to the local matrix
1192  // (lclMatrix), no copying needs to be performed after unpacking.
1193  auto num_packets_per_lid_d =
1194  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1195  numPacketsPerLID.size(), true, "num_packets_per_lid");
1196 
1197  auto import_lids_d =
1198  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1199  importLIDs.size(), true, "import_lids");
1200 
1201  auto imports_d =
1202  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1203  imports.size(), true, "imports");
1204 
1205  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1206  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1207 
1208 //KDDKDD This loop doesn't appear to do anything; what is it?
1209 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1210 //KDDKDD {
1211 //KDDKDD auto lclRow = importLIDs[i];
1212 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1213 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1214 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1215 //KDDKDD }
1216  // Now do the actual unpack!
1217  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1218  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1219  import_lids_d, combineMode);
1220 
1221 }
1222 
1223 template<typename ST, typename LO, typename GO, typename NT>
1224 void
1225 unpackCrsMatrixAndCombineNew(
1226  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1227  Kokkos::DualView<char*,
1229  Kokkos::DualView<size_t*,
1230  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1231  const Kokkos::DualView<const LO*,
1233  const size_t /* constantNumPackets */,
1234  const CombineMode combineMode)
1235 {
1236  using Kokkos::View;
1237  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1238  using dist_object_type = DistObject<char, LO, GO, NT>;
1239  using device_type = typename crs_matrix_type::device_type;
1240  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1241  using buffer_device_type = typename dist_object_type::buffer_device_type;
1242 
1243  static_assert
1244  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1245  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1246  "must be the same.");
1247 
1248  if (numPacketsPerLID.need_sync_device()) {
1249  numPacketsPerLID.sync_device ();
1250  }
1251  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1252 
1253  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1254  auto import_lids_d = importLIDs.view_device ();
1255 
1256  if (imports.need_sync_device()) {
1257  imports.sync_device ();
1258  }
1259  auto imports_d = imports.view_device ();
1260 
1261  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1262  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1263  typedef decltype (local_col_map) local_map_type;
1264 
1265  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1266  local_matrix_device_type,
1267  local_map_type,
1268  buffer_device_type
1269  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1270  import_lids_d, combineMode);
1271 }
1272 
1319 //
1328 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1329 size_t
1331  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1332  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1333  const Teuchos::ArrayView<const char> &imports,
1334  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1335  size_t /* constantNumPackets */,
1336  CombineMode /* combineMode */,
1337  size_t numSameIDs,
1338  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1339  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1340 {
1341  using Kokkos::MemoryUnmanaged;
1342  using Kokkos::View;
1343  typedef typename Node::device_type DT;
1344  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1345 
1346  TEUCHOS_TEST_FOR_EXCEPTION
1347  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1348  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1349  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1350  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1351  // process, then the matrix is neither locally nor globally indexed.
1352  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1353  TEUCHOS_TEST_FOR_EXCEPTION
1354  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1355  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1356  TEUCHOS_TEST_FOR_EXCEPTION
1357  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1358  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1359  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1360 
1361  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1362 
1363  using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1364  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1365 
1366 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1367  Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1368 #else
1369  Kokkos::View<LocalOrdinal const *, kokkos_device_type> permute_from_lids_d =
1370 #endif
1372  permuteFromLIDs.getRawPtr (),
1373  permuteFromLIDs.size (), true,
1374  "permute_from_lids");
1375 
1376 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1377  Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1378 #else
1379  Kokkos::View<const char*, kokkos_device_type> imports_d =
1380 #endif
1382  imports.getRawPtr (),
1383  imports.size (), true,
1384  "imports");
1385 
1386 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1387  Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1388 #else
1389  Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1390 #endif
1392  numPacketsPerLID.getRawPtr (),
1393  numPacketsPerLID.size (), true,
1394  "num_packets_per_lid");
1395 
1397  local_matrix, permute_from_lids_d, imports_d,
1398  num_packets_per_lid_d, numSameIDs);
1399 } //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1400 
1415 
1416 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1417 void
1420  const Kokkos::View<LocalOrdinal const *,
1421  Kokkos::Device<typename Node::device_type::execution_space,
1422  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1423 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1424  , void, void
1425 #endif
1426  > import_lids_d,
1427  const Kokkos::View<const char*,
1428  Kokkos::Device<typename Node::device_type::execution_space,
1429  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1430 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1431  , void, void
1432 #endif
1433  > imports_d,
1434  const Kokkos::View<const size_t*,
1435  Kokkos::Device<typename Node::device_type::execution_space,
1436  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1437 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1438  , void, void
1439 #endif
1440  > num_packets_per_lid_d,
1441  const size_t numSameIDs,
1442  const Kokkos::View<LocalOrdinal const *,
1443  Kokkos::Device<typename Node::device_type::execution_space,
1444  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1445 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1446  , void, void
1447 #endif
1448  > permute_to_lids_d,
1449  const Kokkos::View<LocalOrdinal const *,
1450  Kokkos::Device<typename Node::device_type::execution_space,
1451  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1452 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1453  , void, void
1454 #endif
1455  > permute_from_lids_d,
1456  size_t TargetNumRows,
1457  const int MyTargetPID,
1458  Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1459  Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1460  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,typename Node::device_type>& crs_vals_d,
1461  const Teuchos::ArrayView<const int>& SourcePids,
1462  Kokkos::View<int*,typename Node::device_type> &TargetPids)
1463 {
1464  using execution_space = typename Node::execution_space;
1466 
1467  using Kokkos::View;
1468  using Kokkos::deep_copy;
1469 
1470  using Teuchos::ArrayView;
1471  using Teuchos::outArg;
1472  using Teuchos::REDUCE_MAX;
1473  using Teuchos::reduceAll;
1474 
1475  typedef typename Node::device_type DT;
1476 
1478  typedef typename matrix_type::impl_scalar_type ST;
1479 
1480  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1481 # ifdef HAVE_TPETRA_MMM_TIMINGS
1482  using Teuchos::TimeMonitor;
1483  Teuchos::RCP<TimeMonitor> tm;
1484 # endif
1485 
1486  using Kokkos::MemoryUnmanaged;
1487 
1488  TEUCHOS_TEST_FOR_EXCEPTION
1489  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1490  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1491  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1492  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1493  // process, then the matrix is neither locally nor globally indexed.
1494  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1495  TEUCHOS_TEST_FOR_EXCEPTION
1496  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1497  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1498  TEUCHOS_TEST_FOR_EXCEPTION
1499  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1500  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1501  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1502 
1503  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1504 
1505  // TargetNumNonzeros is number of nonzeros in local matrix.
1506 # ifdef HAVE_TPETRA_MMM_TIMINGS
1507  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1508 # endif
1509  size_t TargetNumNonzeros =
1511  local_matrix, permute_from_lids_d, imports_d,
1512  num_packets_per_lid_d, numSameIDs);
1513 # ifdef HAVE_TPETRA_MMM_TIMINGS
1514  tm = Teuchos::null;
1515 # endif
1516 
1517 # ifdef HAVE_TPETRA_MMM_TIMINGS
1518  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1519 # endif
1520  Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1521  Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1522  Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1523 # ifdef HAVE_TPETRA_MMM_TIMINGS
1524  tm = Teuchos::null;
1525 # endif
1526 
1527  TEUCHOS_TEST_FOR_EXCEPTION(
1528  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1529  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1530  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1531 
1532  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1533  Kokkos::resize(TargetPids,TargetNumNonzeros);
1534  }
1535  Kokkos::deep_copy(execution_space(), TargetPids, -1);
1536 
1537  // Grab pointers for sourceMatrix
1538  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1539 
1540 # ifdef HAVE_TPETRA_MMM_TIMINGS
1541  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1542 # endif
1543  // Convert input arrays to Kokkos::Views
1544  DT outputDevice;
1545 
1546  auto src_pids_d =
1547  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1548  SourcePids.size(), true, "src_pids");
1549 
1550 # ifdef HAVE_TPETRA_MMM_TIMINGS
1551  tm = Teuchos::null;
1552 # endif
1553 
1554  size_t bytes_per_value = 0;
1556  // assume that ST is default constructible
1557  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1558  }
1559  else {
1560  // Since the packed data come from the source matrix, we can use the source
1561  // matrix to get the number of bytes per Scalar value stored in the matrix.
1562  // This assumes that all Scalar values in the source matrix require the same
1563  // number of bytes. If the source matrix has no entries on the calling
1564  // process, then we hope that some process does have some idea how big
1565  // a Scalar value is. Of course, if no processes have any entries, then no
1566  // values should be packed (though this does assume that in our packing
1567  // scheme, rows with zero entries take zero bytes).
1568  size_t bytes_per_value_l = 0;
1569  if (local_matrix.values.extent(0) > 0) {
1570  const ST& val = local_matrix.values(0);
1571  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1572  } else {
1573  const ST& val = crs_vals_d(0);
1574  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1575  }
1576  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1577  Teuchos::REDUCE_MAX,
1578  bytes_per_value_l,
1579  outArg(bytes_per_value));
1580  }
1581 
1582 # ifdef HAVE_TPETRA_MMM_TIMINGS
1583  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1584 # endif
1586  local_matrix, local_col_map, import_lids_d, imports_d,
1587  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1589  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1590  bytes_per_value);
1591 # ifdef HAVE_TPETRA_MMM_TIMINGS
1592  tm = Teuchos::null;
1593 # endif
1594 
1595  // Copy outputs back to host
1596 # ifdef HAVE_TPETRA_MMM_TIMINGS
1597  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1598 # endif
1599 
1600  Kokkos::parallel_for("setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (const size_t i) {
1601  if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1602  });
1603 
1604 } //unpackAndCombineIntoCrsArrays
1605 
1606 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1607 void
1610  const Kokkos::View<LocalOrdinal const *,
1611  Kokkos::Device<typename Node::device_type::execution_space,
1612  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1613 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1614  , void, void
1615 #endif
1616  > import_lids_d,
1617  const Kokkos::View<const char*,
1618  Kokkos::Device<typename Node::device_type::execution_space,
1619  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1620 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1621  , void, void
1622 #endif
1623  > imports_d,
1624  const Kokkos::View<const size_t*,
1625  Kokkos::Device<typename Node::device_type::execution_space,
1626  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1627 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1628  , void, void
1629 #endif
1630  > num_packets_per_lid_d,
1631  const size_t numSameIDs,
1632  const Kokkos::View<LocalOrdinal const *,
1633  Kokkos::Device<typename Node::device_type::execution_space,
1634  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1635 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1636  , void, void
1637 #endif
1638  > permute_to_lids_d,
1639  const Kokkos::View<LocalOrdinal const *,
1640  Kokkos::Device<typename Node::device_type::execution_space,
1641  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1642 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1643  , void, void
1644 #endif
1645  > permute_from_lids_d,
1646  size_t TargetNumRows,
1647  const int MyTargetPID,
1648  Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1649  Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1650  Teuchos::ArrayRCP<Scalar>& CRS_vals,
1651  const Teuchos::ArrayView<const int>& SourcePids,
1652  Teuchos::Array<int>& TargetPids)
1653 {
1654  using execution_space = typename Node::execution_space;
1656 
1657  using Kokkos::View;
1658  using Kokkos::deep_copy;
1659 
1660  using Teuchos::ArrayView;
1661  using Teuchos::outArg;
1662  using Teuchos::REDUCE_MAX;
1663  using Teuchos::reduceAll;
1664 
1665  typedef typename Node::device_type DT;
1666 
1668  typedef typename matrix_type::impl_scalar_type ST;
1669 
1670  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1671 # ifdef HAVE_TPETRA_MMM_TIMINGS
1672  using Teuchos::TimeMonitor;
1673  Teuchos::RCP<TimeMonitor> tm;
1674 # endif
1675 
1676  using Kokkos::MemoryUnmanaged;
1677 
1678  TEUCHOS_TEST_FOR_EXCEPTION
1679  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1680  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1681  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1682  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1683  // process, then the matrix is neither locally nor globally indexed.
1684  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1685  TEUCHOS_TEST_FOR_EXCEPTION
1686  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1687  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1688  TEUCHOS_TEST_FOR_EXCEPTION
1689  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1690  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1691  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1692 
1693  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1694 
1695  // TargetNumNonzeros is number of nonzeros in local matrix.
1696 # ifdef HAVE_TPETRA_MMM_TIMINGS
1697  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1698 # endif
1699  size_t TargetNumNonzeros =
1701  local_matrix, permute_from_lids_d, imports_d,
1702  num_packets_per_lid_d, numSameIDs);
1703 # ifdef HAVE_TPETRA_MMM_TIMINGS
1704  tm = Teuchos::null;
1705 # endif
1706 
1707 # ifdef HAVE_TPETRA_MMM_TIMINGS
1708  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1709 # endif
1710  CRS_rowptr.resize (TargetNumRows+1);
1711  CRS_colind.resize(TargetNumNonzeros);
1712  CRS_vals.resize(TargetNumNonzeros);
1713  Teuchos::ArrayRCP<ST> const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1714 # ifdef HAVE_TPETRA_MMM_TIMINGS
1715  tm = Teuchos::null;
1716 # endif
1717 
1718  TEUCHOS_TEST_FOR_EXCEPTION(
1719  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1720  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1721  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1722 
1723  // Preseed TargetPids with -1 for local
1724  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1725  TargetPids.resize (TargetNumNonzeros);
1726  }
1727  TargetPids.assign (TargetNumNonzeros, -1);
1728 
1729  // Grab pointers for sourceMatrix
1730  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1731 
1732 # ifdef HAVE_TPETRA_MMM_TIMINGS
1733  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1734 # endif
1735  // Convert input arrays to Kokkos::Views
1736  DT outputDevice;
1737 
1738  auto crs_rowptr_d =
1739  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1740  CRS_rowptr.size(), true, "crs_rowptr");
1741 
1742  auto crs_colind_d =
1743  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1744  CRS_colind.size(), true, "crs_colidx");
1745 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1746  static_assert (! std::is_same<
1747  typename std::remove_const<
1748  typename std::decay<
1749  decltype (CRS_vals_impl_scalar_type)
1750  >::type::value_type
1751  >::type,
1752  std::complex<double> >::value,
1753  "CRS_vals::value_type is std::complex<double>; this should never happen"
1754  ", since std::complex does not work in Kokkos::View objects.");
1755 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1756 
1757  auto crs_vals_d =
1758  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1759  CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1760 
1761 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762  static_assert (! std::is_same<
1763  typename decltype (crs_vals_d)::non_const_value_type,
1764  std::complex<double> >::value,
1765  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766  "never happen, since std::complex does not work in Kokkos::View objects.");
1767 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1768 
1769  auto src_pids_d =
1770  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1771  SourcePids.size(), true, "src_pids");
1772 
1773  auto tgt_pids_d =
1774  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1775  TargetPids.size(), true, "tgt_pids");
1776 
1777 # ifdef HAVE_TPETRA_MMM_TIMINGS
1778  tm = Teuchos::null;
1779 # endif
1780 
1781  size_t bytes_per_value = 0;
1783  // assume that ST is default constructible
1784  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1785  }
1786  else {
1787  // Since the packed data come from the source matrix, we can use the source
1788  // matrix to get the number of bytes per Scalar value stored in the matrix.
1789  // This assumes that all Scalar values in the source matrix require the same
1790  // number of bytes. If the source matrix has no entries on the calling
1791  // process, then we hope that some process does have some idea how big
1792  // a Scalar value is. Of course, if no processes have any entries, then no
1793  // values should be packed (though this does assume that in our packing
1794  // scheme, rows with zero entries take zero bytes).
1795  size_t bytes_per_value_l = 0;
1796  if (local_matrix.values.extent(0) > 0) {
1797  const ST& val = local_matrix.values(0);
1798  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1799  } else {
1800  const ST& val = crs_vals_d(0);
1801  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1802  }
1803  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1804  Teuchos::REDUCE_MAX,
1805  bytes_per_value_l,
1806  outArg(bytes_per_value));
1807  }
1808 
1809 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1810  static_assert (! std::is_same<
1811  typename decltype (crs_vals_d)::non_const_value_type,
1812  std::complex<double> >::value,
1813  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1814  "never happen, since std::complex does not work in Kokkos::View objects.");
1815 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1816 
1817 # ifdef HAVE_TPETRA_MMM_TIMINGS
1818  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1819 # endif
1821  local_matrix, local_col_map, import_lids_d, imports_d,
1822  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1823  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1824  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1825  bytes_per_value);
1826 # ifdef HAVE_TPETRA_MMM_TIMINGS
1827  tm = Teuchos::null;
1828 # endif
1829 
1830  // Copy outputs back to host
1831 # ifdef HAVE_TPETRA_MMM_TIMINGS
1832  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1833 # endif
1834  typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1835  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1836  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1837  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1838 
1839  typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1840  CRS_colind.getRawPtr(), CRS_colind.size());
1841  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1842  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1843 
1844  typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1845  CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1846  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1847  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1848 
1849  typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1850  TargetPids.getRawPtr(), TargetPids.size());
1851  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1852  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1853 
1854 } //unpackAndCombineIntoCrsArrays
1855 
1856 
1857 } // namespace Details
1858 } // namespace Tpetra
1859 
1860 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT ) \
1861  template void \
1862  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1863  const CrsMatrix<ST, LO, GO, NT>&, \
1864  const Teuchos::ArrayView<const char>&, \
1865  const Teuchos::ArrayView<const size_t>&, \
1866  const Teuchos::ArrayView<const LO>&, \
1867  size_t, \
1868  CombineMode); \
1869  template size_t \
1870  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1871  const CrsMatrix<ST, LO, GO, NT> &, \
1872  const Teuchos::ArrayView<const LO> &, \
1873  const Teuchos::ArrayView<const char> &, \
1874  const Teuchos::ArrayView<const size_t>&, \
1875  size_t, \
1876  CombineMode, \
1877  size_t, \
1878  const Teuchos::ArrayView<const LO>&, \
1879  const Teuchos::ArrayView<const LO>&); \
1880  template void \
1881  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1882  const CrsMatrix<ST, LO, GO, NT>&, \
1883  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1884  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1885  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1886  const size_t, \
1887  const CombineMode); \
1888  template void \
1889  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1890  const CrsMatrix<ST, LO, GO, NT> &, \
1891  const Kokkos::View<LO const *, \
1892  Kokkos::Device<typename NT::device_type::execution_space, \
1893  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1894  void, void >, \
1895  const Kokkos::View<const char*, \
1896  Kokkos::Device<typename NT::device_type::execution_space, \
1897  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1898  void, void >, \
1899  const Kokkos::View<const size_t*, \
1900  Kokkos::Device<typename NT::device_type::execution_space, \
1901  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1902  void, void >, \
1903  const size_t, \
1904  const Kokkos::View<LO const *, \
1905  Kokkos::Device<typename NT::device_type::execution_space, \
1906  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1907  void, void >, \
1908  const Kokkos::View<LO const *, \
1909  Kokkos::Device<typename NT::device_type::execution_space, \
1910  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1911  void, void >, \
1912  size_t, \
1913  const int, \
1914  Kokkos::View<size_t*,typename NT::device_type>&, \
1915  Kokkos::View<GO*,typename NT::device_type>&, \
1916  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1917  const Teuchos::ArrayView<const int>&, \
1918  Kokkos::View<int*,typename NT::device_type>&); \
1919  template void \
1920  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1921  const CrsMatrix<ST, LO, GO, NT> &, \
1922  const Kokkos::View<LO const *, \
1923  Kokkos::Device<typename NT::device_type::execution_space, \
1924  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1925  void, void >, \
1926  const Kokkos::View<const char*, \
1927  Kokkos::Device<typename NT::device_type::execution_space, \
1928  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1929  void, void >, \
1930  const Kokkos::View<const size_t*, \
1931  Kokkos::Device<typename NT::device_type::execution_space, \
1932  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1933  void, void >, \
1934  const size_t, \
1935  const Kokkos::View<LO const *, \
1936  Kokkos::Device<typename NT::device_type::execution_space, \
1937  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1938  void, void >, \
1939  const Kokkos::View<LO const *, \
1940  Kokkos::Device<typename NT::device_type::execution_space, \
1941  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1942  void, void >, \
1943  size_t, \
1944  const int, \
1945  Teuchos::ArrayRCP<size_t>&, \
1946  Teuchos::ArrayRCP<GO>&, \
1947  Teuchos::ArrayRCP<ST>&, \
1948  const Teuchos::ArrayView<const int>&, \
1949  Teuchos::Array<int>&);
1950 
1951 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT ) \
1952  template void \
1953  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1954  const CrsMatrix<ST, LO, GO, NT>&, \
1955  const Teuchos::ArrayView<const char>&, \
1956  const Teuchos::ArrayView<const size_t>&, \
1957  const Teuchos::ArrayView<const LO>&, \
1958  size_t, \
1959  CombineMode); \
1960  template size_t \
1961  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1962  const CrsMatrix<ST, LO, GO, NT> &, \
1963  const Teuchos::ArrayView<const LO> &, \
1964  const Teuchos::ArrayView<const char> &, \
1965  const Teuchos::ArrayView<const size_t>&, \
1966  size_t, \
1967  CombineMode, \
1968  size_t, \
1969  const Teuchos::ArrayView<const LO>&, \
1970  const Teuchos::ArrayView<const LO>&); \
1971  template void \
1972  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1973  const CrsMatrix<ST, LO, GO, NT>&, \
1974  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1975  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1976  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1977  const size_t, \
1978  const CombineMode); \
1979  template void \
1980  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1981  const CrsMatrix<ST, LO, GO, NT> &, \
1982  const Kokkos::View<LO const *, \
1983  Kokkos::Device<typename NT::device_type::execution_space, \
1984  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1985  const Kokkos::View<const char*, \
1986  Kokkos::Device<typename NT::device_type::execution_space, \
1987  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1988  const Kokkos::View<const size_t*, \
1989  Kokkos::Device<typename NT::device_type::execution_space, \
1990  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1991  const size_t, \
1992  const Kokkos::View<LO const *, \
1993  Kokkos::Device<typename NT::device_type::execution_space, \
1994  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1995  const Kokkos::View<LO const *, \
1996  Kokkos::Device<typename NT::device_type::execution_space, \
1997  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1998  size_t, \
1999  const int, \
2000  Kokkos::View<size_t*,typename NT::device_type>&, \
2001  Kokkos::View<GO*,typename NT::device_type>&, \
2002  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
2003  const Teuchos::ArrayView<const int>&, \
2004  Kokkos::View<int*,typename NT::device_type>&); \
2005  template void \
2006  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
2007  const CrsMatrix<ST, LO, GO, NT> &, \
2008  const Kokkos::View<LO const *, \
2009  Kokkos::Device<typename NT::device_type::execution_space, \
2010  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2011  const Kokkos::View<const char*, \
2012  Kokkos::Device<typename NT::device_type::execution_space, \
2013  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2014  const Kokkos::View<const size_t*, \
2015  Kokkos::Device<typename NT::device_type::execution_space, \
2016  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2017  const size_t, \
2018  const Kokkos::View<LO const *, \
2019  Kokkos::Device<typename NT::device_type::execution_space, \
2020  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2021  const Kokkos::View<LO const *, \
2022  Kokkos::Device<typename NT::device_type::execution_space, \
2023  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2024  size_t, \
2025  const int, \
2026  Teuchos::ArrayRCP<size_t>&, \
2027  Teuchos::ArrayRCP<GO>&, \
2028  Teuchos::ArrayRCP<ST>&, \
2029  const Teuchos::ArrayView<const int>&, \
2030  Teuchos::Array<int>&);
2031 
2032 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
2033 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2034  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT )
2035 #else
2036 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2037  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT )
2038 #endif
2039 
2040 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
&quot;Local&quot; part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...