10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
59 namespace UnpackAndCombineCrsMatrixImpl {
// Unpack one packed row from the raw byte buffer `imports`, starting at byte
// position `offset`, into the caller-provided output arrays.
//
// As the offset arithmetic below shows, the row's bytes are laid out as
//   [entry count][global indices][process ids (optional)][values].
// Process ids are only read when pids_out has nonzero size.
70 template<
class ST,
class LO,
class GO>
72 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
74 const typename PackTraits<ST>::output_array_type& vals_out,
79 const size_t bytes_per_value)
// An empty pids_out means the caller does not want process ids unpacked.
85 bool unpack_pids = pids_out.size() > 0;
// Byte position and byte length of each section of the packed row.
87 const size_t num_ent_beg = offset;
90 const size_t gids_beg = num_ent_beg + num_ent_len;
91 const size_t gids_len =
94 const size_t pids_beg = gids_beg + gids_len;
95 const size_t pids_len = unpack_pids ?
// The values section always follows the (possibly zero-length) pids section.
99 const size_t vals_beg = gids_beg + gids_len + pids_len;
100 const size_t vals_len = num_ent * bytes_per_value;
// Raw pointers to the start of each section inside `imports`.
102 const char*
const num_ent_in = imports + num_ent_beg;
103 const char*
const gids_in = imports + gids_beg;
// pids_in stays null when process ids are not being unpacked.
104 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
105 const char*
const vals_in = imports + vals_beg;
// Running total of bytes consumed; compared with the expected total below.
107 size_t num_bytes_out = 0;
// num_ent_out must agree with the entry count the caller passed in.
110 if (static_cast<size_t> (num_ent_out) != num_ent) {
// p.second is added to the byte total after each step below.
// NOTE(review): p.first looks like a per-step error code - confirm.
115 Kokkos::pair<int, size_t> p;
120 num_bytes_out += p.second;
127 num_bytes_out += p.second;
134 num_bytes_out += p.second;
// Sanity check: bytes consumed must equal the sum of all section lengths.
137 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138 if (num_bytes_out != expected_num_bytes) {
// Team-parallel functor that unpacks packed rows from `imports` and combines
// them into `local_matrix` (sumIntoValues for ADD, replaceValues for REPLACE).
// Each league member handles one (row, batch) pair taken from batch_info.
// Failures are recorded in error_code using the codes 21, 22, 23 and 31.
154 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
156 typedef LocalMatrix local_matrix_type;
159 typedef typename local_matrix_type::value_type ST;
163 typedef typename DT::execution_space XS;
// Views over the communication buffers use BufferDeviceType; offsets and
// batch_info use DT.
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
// Rank-0 view holding a single int error code.
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
// State captured by the functor.
178 local_matrix_type local_matrix;
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
// One row per batch: column 0 is an index into the import arrays, column 1 is
// the batch number within that row (see operator()).
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
187 size_t bytes_per_value;
189 error_type error_code;
// Constructor: stores the arguments in the like-named members.
192 const local_matrix_type& local_matrix_in,
194 const input_buffer_type& imports_in,
195 const num_packets_per_lid_type& num_packets_per_lid_in,
196 const import_lids_type& import_lids_in,
197 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
198 const offsets_type& offsets_in,
200 const size_t batch_size_in,
201 const size_t bytes_per_value_in,
202 const bool atomic_in) :
203 local_matrix (local_matrix_in),
204 local_col_map (local_col_map_in),
205 imports (imports_in),
206 num_packets_per_lid (num_packets_per_lid_in),
207 import_lids (import_lids_in),
208 batch_info (batch_info_in),
209 offsets (offsets_in),
210 combine_mode (combine_mode_in),
211 batch_size (batch_size_in),
212 bytes_per_value (bytes_per_value_in),
// Per-team work: validate one packed row, then combine the entries of one
// batch of that row into local_matrix.
217 KOKKOS_INLINE_FUNCTION
218 void operator()(member_type team_member)
const
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
// The league rank selects the batch; batch_info maps it to a row index and a
// batch number within that row.
224 const LO batch = team_member.league_rank();
225 const LO lid_no = batch_info(batch, 0);
226 const LO batch_no = batch_info(batch, 1);
// Number of bytes received for this row.
228 const size_t num_bytes = num_packets_per_lid(lid_no);
235 const LO import_lid = import_lids(lid_no);
236 const size_t buf_size = imports.size();
237 const size_t offset = offsets(lid_no);
// Start of this row's packed bytes.
241 const char*
const in_buf = imports.data() + offset;
243 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
246 size_t expected_num_bytes = 0;
// Error 21: the row needs more bytes than were received for it.
253 if (expected_num_bytes > num_bytes)
256 #ifndef KOKKOS_ENABLE_SYCL
258 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
// The compare-exchange only replaces a stored 0, so an error code that was
// already recorded is not overwritten.
263 Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
// Error 22: the row's byte range does not fit inside the imports buffer.
267 if (offset > buf_size || offset + num_bytes > buf_size)
270 #ifndef KOKKOS_ENABLE_SYCL
272 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273 "At row %d, the offset (%d) > buffer size (%d)\n",
274 (
int) lid_no, (
int) offset, (
int) buf_size
277 Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
// Entries handled by this batch: the whole row when it fits in one batch, a
// full batch when at least (batch_no + 1) batches' worth of entries exist,
// otherwise what is left after batch_no full batches.
282 size_t num_entries_in_batch = 0;
283 if (num_entries_in_row <= batch_size)
284 num_entries_in_batch = num_entries_in_row;
285 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286 num_entries_in_batch = batch_size;
288 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
// Section boundaries inside the row's bytes: the entry count comes first,
// then the global indices for the whole row, then the values.
291 const size_t num_ent_start = offset;
292 const size_t num_ent_end = num_ent_start + bytes_per_lid;
295 const size_t gids_start = num_ent_end;
296 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
298 const size_t vals_start = gids_end;
// `shift` is the number of entries that precede this batch within the row, so
// the gid and value pointers start at this batch's first entry.
300 const size_t shift = batch_no * batch_size;
301 const char*
const num_ent_in = imports.data() + num_ent_start;
302 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
// Error 23: num_ent_out disagrees with the row's entry count.
307 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
310 #ifndef KOKKOS_ENABLE_SYCL
312 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
317 Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
// Compile-time constant handed to sumIntoValues / replaceValues below.
320 constexpr
bool matrix_has_sorted_rows =
true;
// The threads of the team share the entries of this batch.
323 Kokkos::parallel_for(
324 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
// Byte distance of entry j within the gid section, then within the value
// section.
330 distance = j * bytes_per_gid;
340 distance = j * bytes_per_value;
// ADD passes the functor's `atomic` flag on to sumIntoValues.
343 if (combine_mode ==
ADD) {
347 const bool use_atomic_updates = atomic;
348 (void)local_matrix.sumIntoValues(
353 matrix_has_sorted_rows,
356 }
// REPLACE never requests atomic updates.
else if (combine_mode ==
REPLACE) {
360 const bool use_atomic_updates =
false;
361 (void)local_matrix.replaceValues(
366 matrix_has_sorted_rows,
// Error 31: reported with the "unknown error" message below.
372 #ifndef KOKKOS_ENABLE_SYCL
374 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
378 Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
// Barrier across the team before operator() ends.
383 team_member.team_barrier();
// Copy the error code to host memory and return its value.
389 auto error_code_h = Kokkos::create_mirror_view_and_copy(
390 Kokkos::HostSpace(), error_code
392 return error_code_h();
// Empty tag types. They select which tagged operator()/join overload a Kokkos
// range policy dispatches to: one for the maximum, one for the total.
397 struct MaxNumEntTag {};
398 struct TotNumEntTag {};
// Reduction functor over packed rows; the reduced value is a size_t.
// For row i the packed bytes start at imports.data() + offsets(i) and
// num_packets_per_lid(i) gives that row's byte count.
// Two tagged reductions are provided:
//   MaxNumEntTag - keeps the largest per-row entry count
//   TotNumEntTag - sums the per-row entry counts
408 template<
class LO,
class DT,
class BDT>
411 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412 typedef Kokkos::View<const size_t*, DT> offsets_type;
413 typedef Kokkos::View<const char*, BDT> input_buffer_type;
// Type of the reduction result.
416 typedef size_t value_type;
419 num_packets_per_lid_type num_packets_per_lid;
420 offsets_type offsets;
421 input_buffer_type imports;
// Constructor: stores the views in the like-named members.
425 const offsets_type& offsets_in,
426 const input_buffer_type& imports_in) :
427 num_packets_per_lid (num_packets_per_lid_in),
428 offsets (offsets_in),
// Max reduction: raise `update` to row i's entry count when that is larger.
432 KOKKOS_INLINE_FUNCTION
void
433 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
435 const size_t num_bytes = num_packets_per_lid(i);
438 const char*
const in_buf = imports.data () + offsets(i);
440 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
442 update = (update < num_ent) ? num_ent : update;
// Join for the max reduction: keep the larger of the two partial results.
446 KOKKOS_INLINE_FUNCTION
void
447 join (
const MaxNumEntTag,
449 const value_type& src)
const
451 if (dst < src) dst = src;
// Sum reduction: add row i's entry count to the running total.
454 KOKKOS_INLINE_FUNCTION
void
455 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
457 const size_t num_bytes = num_packets_per_lid(i);
460 const char*
const in_buf = imports.data () + offsets(i);
462 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
// Runs the MaxNumEntTag reduction over every row to unpack (one row per entry
// of num_packets_per_lid) and accumulates the result in max_num_ent, which
// starts at 0.
474 template<
class LO,
class DT,
class BDT>
476 compute_maximum_num_entries (
477 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478 const Kokkos::View<const size_t*, DT>& offsets,
479 const Kokkos::View<const char*, BDT>& imports)
481 typedef typename DT::execution_space XS;
// The tag in the policy selects the functor's MaxNumEntTag overloads.
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483 MaxNumEntTag> range_policy;
// The row count is narrowed from the view extent to LO.
487 const LO numRowsToUnpack =
488 static_cast<LO
> (num_packets_per_lid.extent (0));
489 size_t max_num_ent = 0;
490 Kokkos::parallel_reduce (
"Max num entries in CRS",
491 range_policy (0, numRowsToUnpack),
492 functor, max_num_ent);
// Runs the TotNumEntTag reduction of NumEntriesFunctor over every row to
// unpack (one row per entry of num_packets_per_lid) and accumulates the sum
// in tot_num_ent, which starts at 0.
503 template<
class LO,
class DT,
class BDT>
505 compute_total_num_entries (
506 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507 const Kokkos::View<const size_t*, DT>& offsets,
508 const Kokkos::View<const char*, BDT>& imports)
510 typedef typename DT::execution_space XS;
// The tag in the policy selects the functor's TotNumEntTag overload.
511 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512 size_t tot_num_ent = 0;
513 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
// The row count is narrowed from the view extent to LO.
515 const LO numRowsToUnpack =
516 static_cast<LO
> (num_packets_per_lid.extent (0));
517 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
518 range_policy (0, numRowsToUnpack),
519 functor, tot_num_ent);
// Reads the entry count (an LO) stored at imports + offset and returns it as
// a size_t. Returns OrdinalTraits<size_t>::invalid() when the packed size of
// an LO is larger than num_bytes.
524 KOKKOS_INLINE_FUNCTION
526 unpackRowCount(
const char imports[],
528 const size_t num_bytes)
530 using PT = PackTraits<LO>;
// Size in bytes of one packed LO.
534 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
// The row's bytes cannot even hold the entry count: report failure.
535 if (p_num_bytes > num_bytes) {
536 return OrdinalTraits<size_t>::invalid();
538 const char*
const in_buf = imports + offset;
// The return value of unpackValue is deliberately ignored.
539 (void) PT::unpackValue(num_ent_LO, in_buf);
541 return static_cast<size_t>(num_ent_LO);
// Fills batch_info with one row per (i, batch_no) pair, for every index i of
// batches_per_lid and every batch_no below batches_per_lid(i).
// Returns true when `batch` equals the number of rows of batch_info.
// NOTE(review): `batch` looks like the running output row index - confirm.
548 template<
class View1,
class View2>
552 const View1& batches_per_lid,
// The entry type of batch_info; row indices are cast to it below.
556 using LO =
typename View2::value_type;
558 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
560 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
// Column 0 holds the row index, column 1 the batch number within that row.
562 batch_info(batch, 0) =
static_cast<LO
>(i);
563 batch_info(batch, 1) = batch_no;
567 return batch == batch_info.extent(0);
// Unpacks the rows in `imports` and combines them into `local_matrix`.
//
// Visible steps:
//   1. Validate the combine mode (only ADD and REPLACE pass) and check that
//      import_lids and num_packets_per_lid have the same length.
//   2. Compute the batch size, bounded by the largest row's entry count.
//   3. Count the batches each row needs and build the batch_info table.
//   4. Launch UnpackCrsMatrixAndCombineFunctor with one team per batch.
//   5. Test the error code the functor reports.
// Throws std::invalid_argument on a bad combine mode or a length mismatch.
577 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
579 unpackAndCombineIntoCrsMatrix(
580 const LocalMatrix& local_matrix,
581 const LocalMap& local_map,
582 const Kokkos::View<const char*, BufferDeviceType>& imports,
583 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
584 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
587 using ST =
typename LocalMatrix::value_type;
590 using XS =
typename DT::execution_space;
// Prefix for every exception message thrown from this function.
591 const char prefix[] =
592 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593 "unpackAndCombineIntoCrsMatrix: ";
595 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
// Special case: no rows to import.
596 if (num_import_lids == 0) {
// ABSMAX and INSERT are rejected with dedicated messages.
603 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
604 std::invalid_argument,
605 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
606 "static graph (i.e., was constructed with the CrsMatrix constructor "
607 "that takes a const CrsGraph pointer).");
609 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
610 std::invalid_argument,
611 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
612 "(i.e., was constructed with the CrsMatrix constructor that takes a "
613 "const CrsGraph pointer).");
// Any mode other than ADD or REPLACE is rejected.
616 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
617 std::invalid_argument,
618 prefix <<
"Invalid combine mode; should never get "
619 "here! Please report this bug to the Tpetra developers.");
// There must be exactly one packet count per import LID.
622 bool bad_num_import_lids =
623 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
624 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625 std::invalid_argument,
626 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
627 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
// One offset per row plus one extra trailing slot.
631 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
635 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
// A batch never needs to be larger than the longest row.
637 const size_t batch_size = std::min(default_batch_size, max_num_ent);
// batch_info starts empty and is resized once the batch count is known.
640 size_t num_batches = 0;
641 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
642 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
// Count the batches each row needs while summing them in the reduction.
644 Kokkos::parallel_reduce(
645 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
648 const size_t num_entries_in_row = unpackRowCount<LO>(
649 imports.data(), offsets(i), num_packets_per_lid(i)
// For rows longer than one batch: integer division rounded up.
652 (num_entries_in_row <= batch_size) ?
654 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655 batches += batches_per_lid(i);
659 Kokkos::resize(batch_info, num_batches);
// The batch table is filled through host mirrors of the two views.
661 Kokkos::HostSpace host_space;
662 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
666 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
// The boolean result of compute_batch_info is deliberately ignored.
668 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
// Atomic updates are requested unless the execution space reports a
// concurrency of exactly 1.
677 const bool atomic = XS().concurrency() != 1;
678 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
693 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
// Kokkos picks the team size when XS is not a GPU execution space or when no
// valid team_size is set; otherwise the explicit team_size is used.
695 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
697 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
701 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
// Fetch the functor's error code and test it.
704 auto error_code = f.error();
705 TEUCHOS_TEST_FOR_EXCEPTION(
708 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
// Entry-counting helper. It sums the row lengths of local_matrix (taken from
// graph.row_map) over the first num_same_ids rows and over the rows named in
// permute_from_lids, then calls compute_total_num_entries on the packed
// import buffer.
712 template<
class LocalMatrix,
class BufferDeviceType>
715 const LocalMatrix& local_matrix,
716 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
// The buffer view types are spelled differently depending on
// KOKKOS_ENABLE_DEPRECATED_CODE_4.
717 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
718 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
719 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
721 const Kokkos::View<const char*, BufferDeviceType>& imports,
722 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
724 const size_t num_same_ids)
726 using Kokkos::parallel_reduce;
727 typedef typename LocalMatrix::ordinal_type LO;
728 typedef typename LocalMatrix::device_type device_type;
729 typedef typename device_type::execution_space XS;
730 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
731 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
732 typedef BufferDeviceType BDT;
// Rows [0, num_same_ids): row length = row_map[lid+1] - row_map[lid].
738 num_items =
static_cast<LO
>(num_same_ids);
741 parallel_reduce(range_policy(0, num_items),
742 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
743 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
744 -local_matrix.graph.row_map[lid]);
// Permuted rows: the source row index comes from permute_from_lids.
750 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
753 parallel_reduce(range_policy(0, num_items),
754 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
755 const LO lid = permute_from_lids(i);
756 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
757 - local_matrix.graph.row_map[lid]);
// Imported rows: one offset per packet plus one extra trailing slot.
764 const size_type np = num_packets_per_lid.extent(0);
765 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
768 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
// For every imported row i, reads the row's entry count from the packed
// buffer and atomically adds it to tgt_rowptr(import_lids(i)). The reduction
// variable k_error is an int.
776 template<
class LO,
class DT,
class BDT>
778 setupRowPointersForRemotes(
779 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
781 const Kokkos::View<const char*, BDT>& imports,
782 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
783 const typename PackTraits<size_t>::input_array_type& offsets)
785 using Kokkos::parallel_reduce;
786 typedef typename DT::execution_space XS;
787 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
788 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Sentinel that unpackRowCount returns on failure.
790 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
791 const size_type N = num_packets_per_lid.extent(0);
794 parallel_reduce (
"Setup row pointers for remotes",
796 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// Element type of tgt_rowptr, used to cast the increment.
797 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
798 const size_t num_bytes = num_packets_per_lid(i);
799 const size_t offset = offsets(i);
800 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
// The row's entry count could not be read.
801 if (num_ent == InvalidNum) {
// Atomic add of this row's entry count into its target slot.
804 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
// Runs a parallel_scan over the entries of new_start_row. Inside the scan
// body tgt_rowptr(i) is overwritten with the running scan value, and that
// same value is copied into new_start_row(i).
// NOTE(review): presumably this turns per-row lengths into CRS row offsets,
// as the function name says - confirm.
812 makeCrsRowPtrFromLengths(
813 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
814 const Kokkos::View<size_t*,DT>& new_start_row)
816 using Kokkos::parallel_scan;
817 typedef typename DT::execution_space XS;
818 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
819 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
820 const size_type N = new_start_row.extent(0);
821 parallel_scan(range_policy(0, N),
822 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Save the current value before tgt_rowptr(i) is overwritten.
823 auto cur_val = tgt_rowptr(i);
825 tgt_rowptr(i) = update;
826 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of local_matrix into the target arrays.
// Source and target row index are the same. For each entry the value is
// copied, the local column index is converted to a global index through
// local_col_map, and the owning process id is written, with -1 standing in
// for my_pid.
833 template<
class LocalMatrix,
class LocalMap>
836 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
838 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
839 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
840 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
842 const LocalMatrix& local_matrix,
843 const LocalMap& local_col_map,
844 const size_t num_same_ids,
847 using Kokkos::parallel_for;
850 typedef typename DT::execution_space XS;
851 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
853 parallel_for(range_policy(0, num_same_ids),
854 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used to cast the increment.
855 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// First entry of the source row.
857 const LO src_lid =
static_cast<LO
>(i);
858 size_t src_row = local_matrix.graph.row_map(src_lid);
// First entry of the target row.
860 const LO tgt_lid =
static_cast<LO
>(i);
861 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the target row's next-free position by the source row's length.
863 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
864 - local_matrix.graph.row_map(src_lid);
865 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source lands at tgt_row + (j - src_row) in the target.
867 for (
size_t j=local_matrix.graph.row_map(src_lid);
868 j<local_matrix.graph.row_map(src_lid+1); ++j) {
869 LO src_col = local_matrix.graph.entries(j);
870 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
871 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// Columns owned by my_pid are marked with -1.
872 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows of local_matrix into the target arrays: source row
// permute_from_lids(i) goes to target row permute_to_lids(i). For each entry
// the value is copied, the local column index is converted to a global index
// through local_col_map, and the owning process id is written, with -1
// standing in for my_pid.
878 template<
class LocalMatrix,
class LocalMap>
880 copyDataFromPermuteIDs(
881 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
883 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
884 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
885 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
887 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
888 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
889 const LocalMatrix& local_matrix,
890 const LocalMap& local_col_map,
893 using Kokkos::parallel_for;
896 typedef typename DT::execution_space XS;
897 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
898 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// One loop iteration per entry of permute_to_lids.
900 const size_type num_permute_to_lids = permute_to_lids.extent(0);
902 parallel_for(range_policy(0, num_permute_to_lids),
903 KOKKOS_LAMBDA(
const size_t i) {
// Element type of new_start_row, used to cast the increment.
904 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// First entry of the source row.
906 const LO src_lid = permute_from_lids(i);
907 const size_t src_row = local_matrix.graph.row_map(src_lid);
// First entry of the target row.
909 const LO tgt_lid = permute_to_lids(i);
910 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the target row's next-free position by the source row's length.
912 size_t nsr = local_matrix.graph.row_map(src_lid+1)
913 - local_matrix.graph.row_map(src_lid);
914 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source lands at tgt_row + (j - src_row) in the target.
916 for (
size_t j=local_matrix.graph.row_map(src_lid);
917 j<local_matrix.graph.row_map(src_lid+1); ++j) {
918 LO src_col = local_matrix.graph.entries(j);
919 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
920 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// Columns owned by my_pid are marked with -1.
921 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks every imported row from `imports` directly into the target arrays
// (tgt_colind, tgt_vals, tgt_pids). For each row a slice of the target row is
// reserved by atomically advancing new_start_row, unpackRow fills the slice,
// and process ids equal to my_pid are then rewritten to -1.
// The values returned by unpackRow are accumulated into the reduction
// variable k_error.
927 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
929 unpackAndCombineIntoCrsArrays2(
930 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
932 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
933 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
934 const typename PackTraits<size_t>::input_array_type& offsets,
935 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
// The buffer view types are spelled differently depending on
// KOKKOS_ENABLE_DEPRECATED_CODE_4.
936 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
937 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
938 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
940 const Kokkos::View<const char*, BufferDeviceType>& imports,
941 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
946 const size_t bytes_per_value)
949 using Kokkos::subview;
950 using Kokkos::MemoryUnmanaged;
951 using Kokkos::parallel_reduce;
952 using Kokkos::atomic_fetch_add;
957 typedef typename LocalMatrix::value_type ST;
958 typedef typename DT::execution_space XS;
959 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
960 typedef typename Kokkos::pair<size_type, size_type> slice;
961 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Unmanaged view types for the per-row slices of the target arrays.
963 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
964 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
965 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Sentinel that unpackRowCount returns on failure.
967 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
970 const size_type num_import_lids = import_lids.size();
973 parallel_reduce (
"Unpack and combine into CRS",
974 range_policy (0, num_import_lids),
975 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// Element type of new_start_row, used to cast the increment.
976 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
977 const size_t num_bytes = num_packets_per_lid(i);
978 const size_t offset = offsets(i);
// Rows that received no bytes get special handling.
979 if (num_bytes == 0) {
983 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
// The row's entry count could not be read.
984 if (num_ent == InvalidNum) {
// Reserve num_ent consecutive slots in the target row; the fetch-add returns
// the position before the increment.
988 const LO lcl_row = import_lids(i);
989 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
990 const size_t end_row = start_row + num_ent;
// Slices of the target arrays that cover the reserved slots.
992 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
993 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
994 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
996 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
997 imports.data(), offset, num_bytes,
998 num_ent, bytes_per_value);
// Entries owned by my_pid are marked with -1.
1001 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1002 const int pid = pids_out(j);
1003 pids_out(j) = (pid != my_pid) ? pid : -1;
// Builds the target CRS arrays (tgt_rowptr, tgt_colind, tgt_vals, tgt_pids)
// from three sources: rows kept in place (the first num_same_ids rows),
// permuted rows, and rows received in `imports`.
//
// Visible steps:
//   1. Store the length of every same-ID and permuted row in tgt_rowptr.
//   2. Add the lengths of the imported rows (setupRowPointersForRemotes).
//   3. Hand tgt_rowptr and new_start_row to makeCrsRowPtrFromLengths.
//   4. Copy the same-ID rows and the permuted rows.
//   5. Unpack the imported rows (unpackAndCombineIntoCrsArrays2).
// Throws std::logic_error when one of the device kernels reports an error.
1010 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1013 const LocalMatrix & local_matrix,
1014 const LocalMap & local_col_map,
1015 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
// The buffer view types are spelled differently depending on
// KOKKOS_ENABLE_DEPRECATED_CODE_4.
1016 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1017 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1018 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1020 const Kokkos::View<const char*, BufferDeviceType>& imports,
1021 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1023 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1024 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1025 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1026 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1027 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1030 const size_t num_same_ids,
1031 const size_t tgt_num_rows,
1032 const size_t tgt_num_nonzeros,
1033 const int my_tgt_pid,
1034 const size_t bytes_per_value)
1037 using Kokkos::subview;
1038 using Kokkos::parallel_for;
1039 using Kokkos::MemoryUnmanaged;
1043 typedef typename DT::execution_space XS;
1044 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1045 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1046 typedef BufferDeviceType BDT;
// Prefix for every exception message thrown from this function.
1048 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1050 const size_t N = tgt_num_rows;
1054 const int my_pid = my_tgt_pid;
// First pass touches N+1 indices, i.e. one more than the number of rows.
1057 parallel_for(range_policy(0, N+1),
1058 KOKKOS_LAMBDA(
const size_t i) {
// Same-ID rows: source and target row index coincide; store the row length.
1064 parallel_for(range_policy(0, num_same_ids),
1065 KOKKOS_LAMBDA(
const size_t i) {
1066 const LO tgt_lid =
static_cast<LO
>(i);
1067 const LO src_lid =
static_cast<LO
>(i);
1068 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1069 - local_matrix.graph.row_map(src_lid);
// Permuted rows: store the source row's length at the target row.
1074 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1075 parallel_for(range_policy(0, num_permute_to_lids),
1076 KOKKOS_LAMBDA(
const size_t i) {
1077 const LO tgt_lid = permute_to_lids(i);
1078 const LO src_lid = permute_from_lids(i);
1079 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1080 - local_matrix.graph.row_map(src_lid);
// Offsets for the imported rows: one per row plus one extra trailing slot.
1085 const size_type num_import_lids = import_lids.extent(0);
1086 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// Debug builds only: the last offset must equal the size of the buffer.
1089 #ifdef HAVE_TPETRA_DEBUG
1091 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1092 const bool condition =
1093 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1094 TEUCHOS_TEST_FOR_EXCEPTION
1095 (condition, std::logic_error, prefix
1096 <<
"The final offset in bytes " << nth_offset_h
1097 <<
" != imports.size() = " << imports.extent(0)
1098 <<
". Please report this bug to the Tpetra developers.");
1100 #endif // HAVE_TPETRA_DEBUG
// Add the imported rows' lengths to tgt_rowptr.
1104 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1105 import_lids, imports, num_packets_per_lid, offsets);
1106 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1107 <<
" Error transferring data to target row pointers. "
1108 "Please report this bug to the Tpetra developers.");
// new_start_row is advanced by the copy and unpack steps below as each of
// them places entries into a row.
1112 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1115 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Copy the rows that already live on this process.
1118 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1119 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1121 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1122 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1123 local_matrix, local_col_map, my_pid);
// Special case: the imports buffer is empty (extent is unsigned, so this
// condition is the same as == 0).
1125 if (imports.extent(0) <= 0) {
// Unpack the imported rows and test the accumulated error value.
1129 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1130 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1131 local_matrix, local_col_map, my_pid, bytes_per_value);
1132 TEUCHOS_TEST_FOR_EXCEPTION(
1133 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1134 "should never happen. Please report this bug to the Tpetra developers.");
// Wrapper that takes its buffers as Teuchos::ArrayView. It builds the objects
// num_packets_per_lid_d, import_lids_d and imports_d from the ArrayViews,
// fetches the local column map of sourceMatrix, and forwards everything to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1180 template<
typename ST,
typename LO,
typename GO,
typename Node>
1184 const Teuchos::ArrayView<const char>& imports,
1185 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1186 const Teuchos::ArrayView<const LO>& importLIDs,
1191 typedef typename Node::device_type device_type;
// The local matrix must live on Node's device type.
1193 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1194 "Node::device_type and LocalMatrix::device_type must be the same.");
1197 device_type outputDevice;
// Each of the three objects below is created from a size, a boolean flag and
// a label.
// NOTE(review): presumably these are device copies of the host arrays and the
// flag requests a copy - confirm against the helper being called.
1202 auto num_packets_per_lid_d =
1204 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1206 auto import_lids_d =
1208 importLIDs.size(),
true,
"import_lids");
1212 imports.size(),
true,
"imports");
1215 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// Hand over to the implementation routine.
1226 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1227 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1228 import_lids_d, combineMode);
// DualView-based wrapper. It syncs numPacketsPerLID and imports to the device
// when they need it, asserts that importLIDs does not need a device sync,
// and forwards the device views to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1232 template<
typename ST,
typename LO,
typename GO,
typename NT>
1234 unpackCrsMatrixAndCombineNew(
1236 Kokkos::DualView<
char*,
1238 Kokkos::DualView<
size_t*,
1240 const Kokkos::DualView<
const LO*,
1248 using device_type =
typename crs_matrix_type::device_type;
1249 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1250 using buffer_device_type =
typename dist_object_type::buffer_device_type;
// The local matrix must live on the matrix's device type.
1253 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1254 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1255 "must be the same.");
// Sync only when the DualView reports that the device copy needs it.
1257 if (numPacketsPerLID.need_sync_device()) {
1258 numPacketsPerLID.sync_device ();
1260 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
// importLIDs is const here, so it is only checked, not synced.
1262 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1263 auto import_lids_d = importLIDs.view_device ();
1265 if (imports.need_sync_device()) {
1266 imports.sync_device ();
1268 auto imports_d = imports.view_device ();
1271 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1272 typedef decltype (local_col_map) local_map_type;
// Hand over to the implementation routine with explicit template arguments.
1274 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1275 local_matrix_device_type,
1278 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1279 import_lids_d, combineMode);
// Public wrapper taking Teuchos::ArrayView arguments. It validates the
// inputs, builds Kokkos views for permuteFromLIDs, imports and
// numPacketsPerLID, and passes them, together with the local matrix and
// numSameIDs, to a helper call.
// Throws std::invalid_argument when permuteToLIDs and permuteFromLIDs differ
// in size, when sourceMatrix is not locally indexed, or when importLIDs and
// numPacketsPerLID differ in size.
1337 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1340 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1341 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1342 const Teuchos::ArrayView<const
char> &imports,
1343 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1347 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1348 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1350 using Kokkos::MemoryUnmanaged;
1352 typedef typename Node::device_type DT;
// Prefix for every exception message thrown from this function.
1353 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The two permutation lists must pair up one to one.
1355 TEUCHOS_TEST_FOR_EXCEPTION
1356 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1357 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1358 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
// The source matrix must be locally indexed.
1361 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1362 TEUCHOS_TEST_FOR_EXCEPTION
1363 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1364 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1365 TEUCHOS_TEST_FOR_EXCEPTION
1366 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1367 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1368 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1370 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
// Device type for the buffers: Node's execution space combined with the
// communication-buffer memory space.
1372 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1373 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
// The view types below are spelled differently depending on
// KOKKOS_ENABLE_DEPRECATED_CODE_4. Each view is built from the ArrayView's
// raw pointer and size, a boolean flag and a label.
1375 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1376 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1378 Kokkos::View<LocalOrdinal const *, kokkos_device_type> permute_from_lids_d =
1381 permuteFromLIDs.getRawPtr (),
1382 permuteFromLIDs.size (),
true,
1383 "permute_from_lids");
1385 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1386 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1388 Kokkos::View<const char*, kokkos_device_type> imports_d =
1391 imports.getRawPtr (),
1392 imports.size (),
true,
1395 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1396 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1398 Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1401 numPacketsPerLID.getRawPtr (),
1402 numPacketsPerLID.size (),
true,
1403 "num_packets_per_lid");
// Arguments of the final helper call.
1406 local_matrix, permute_from_lids_d, imports_d,
1407 num_packets_per_lid_d, numSameIDs);
/// \brief Unpack received rows of the source matrix into device-resident
///   CRS arrays (overload whose outputs are Kokkos::View objects).
///
/// Steps visible in this function: validate input sizes, compute
/// TargetNumNonzeros, resize crs_rowptr_d to TargetNumRows+1 and
/// crs_colind_d / crs_vals_d to TargetNumNonzeros, make TargetPids hold
/// TargetNumNonzeros entries, agree on a global bytes-per-value, run the
/// unpack step, and finally set every TargetPids entry that equals -1 to
/// MyTargetPID.
///
/// \param num_packets_per_lid_d  Per-LID packet counts; must have as many
///   entries as import_lids_d.
/// \param numSameIDs  Forwarded to the counting and unpack steps.
/// \param permute_to_lids_d, permute_from_lids_d  Must have equal sizes.
/// \param TargetNumRows  Number of rows of the target; sizes crs_rowptr_d.
/// \param MyTargetPID  Value written into TargetPids entries left at -1.
/// \param crs_rowptr_d, crs_colind_d  Output views; resized here.
/// \param SourcePids  Host array of source process ids.
/// \param TargetPids  Output view; resized here if its size differs from
///   TargetNumNonzeros.
///
/// \throws std::invalid_argument if the two permute LID views differ in
///   size, if the source matrix is not locally indexed, or if
///   import_lids_d and num_packets_per_lid_d differ in size.
1425 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1429 const Kokkos::View<LocalOrdinal
const *,
1430 Kokkos::Device<
typename Node::device_type::execution_space,
1431 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1432 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1436 const Kokkos::View<
const char*,
1437 Kokkos::Device<
typename Node::device_type::execution_space,
1438 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1439 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1443 const Kokkos::View<
const size_t*,
1444 Kokkos::Device<
typename Node::device_type::execution_space,
1445 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1446 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1449 > num_packets_per_lid_d,
1450 const size_t numSameIDs,
1451 const Kokkos::View<LocalOrdinal
const *,
1452 Kokkos::Device<
typename Node::device_type::execution_space,
1453 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1454 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1457 > permute_to_lids_d,
1458 const Kokkos::View<LocalOrdinal
const *,
1459 Kokkos::Device<
typename Node::device_type::execution_space,
1460 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1461 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1464 > permute_from_lids_d,
1465 size_t TargetNumRows,
1466 const int MyTargetPID,
1467 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1468 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1470 const Teuchos::ArrayView<const int>& SourcePids,
1471 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1473 using execution_space =
typename Node::execution_space;
1479 using Teuchos::ArrayView;
1480 using Teuchos::outArg;
1481 using Teuchos::REDUCE_MAX;
1482 using Teuchos::reduceAll;
1484 typedef typename Node::device_type DT;
1487 typedef typename matrix_type::impl_scalar_type ST;
// Prefix for exception messages and (when timings are enabled) timer names.
// NOTE(review): the prefix ends in _new, while the instantiation macros in
// this file name unpackAndCombineIntoCrsArrays -- confirm this is intended.
1489 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1490 # ifdef HAVE_TPETRA_MMM_TIMINGS
1491 using Teuchos::TimeMonitor;
1492 Teuchos::RCP<TimeMonitor> tm;
1495 using Kokkos::MemoryUnmanaged;
// Input validation: the permute-to and permute-from LID views must have the
// same length.
1497 TEUCHOS_TEST_FOR_EXCEPTION
1498 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1499 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1500 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// The source matrix must already be locally indexed.
1504 TEUCHOS_TEST_FOR_EXCEPTION
1505 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1506 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1507 TEUCHOS_TEST_FOR_EXCEPTION
1508 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1509 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1510 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1515 # ifdef HAVE_TPETRA_MMM_TIMINGS
1516 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
// Number of entries the target CRS arrays must hold; it sizes the column
// index, value, and PID arrays below.
1518 size_t TargetNumNonzeros =
1520 local_matrix, permute_from_lids_d, imports_d,
1521 num_packets_per_lid_d, numSameIDs);
1522 # ifdef HAVE_TPETRA_MMM_TIMINGS
1526 # ifdef HAVE_TPETRA_MMM_TIMINGS
1527 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Row pointer array has one more entry than there are rows.
1529 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1530 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1531 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1532 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made at the top of the
// function, and its message text has no space before the != sign.
1536 TEUCHOS_TEST_FOR_EXCEPTION(
1537 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1538 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1539 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// TargetPids needs one entry per target nonzero.
1541 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1542 Kokkos::resize(TargetPids,TargetNumNonzeros);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1547 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1549 # ifdef HAVE_TPETRA_MMM_TIMINGS
1550 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1557 SourcePids.size(),
true,
"src_pids");
1559 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of a REDUCE_MAX over the source
// matrix's communicator (presumably of the local bytes_per_value_l), so all
// processes use the same per-value size.
1563 size_t bytes_per_value = 0;
1577 size_t bytes_per_value_l = 0;
// Sample a value from the local matrix if it has any; the other visible
// branch samples crs_vals_d instead.
1578 if (local_matrix.values.extent(0) > 0) {
1579 const ST& val = local_matrix.values(0);
1582 const ST& val = crs_vals_d(0);
1585 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1586 Teuchos::REDUCE_MAX,
1588 outArg(bytes_per_value));
1591 # ifdef HAVE_TPETRA_MMM_TIMINGS
1592 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
// Unpack step: the output views and TargetPids are passed directly.
1595 local_matrix, local_col_map, import_lids_d, imports_d,
1596 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1597 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1598 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1600 # ifdef HAVE_TPETRA_MMM_TIMINGS
1605 # ifdef HAVE_TPETRA_MMM_TIMINGS
1606 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Any TargetPids entry still equal to -1 is set to MyTargetPID.
1609 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (
const size_t i) {
1610 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
/// \brief Unpack received rows of the source matrix into host CRS arrays
///   (overload whose outputs are Teuchos::ArrayRCP / Teuchos::Array).
///
/// Steps visible in this function: validate input sizes, compute
/// TargetNumNonzeros, resize CRS_rowptr to TargetNumRows+1 and CRS_colind /
/// CRS_vals to TargetNumNonzeros, fill TargetPids with -1, build device
/// views named crs_rowptr, crs_colidx, crs_vals, src_pids and tgt_pids from
/// the host arrays, agree on a global bytes-per-value, run the unpack step
/// on the device views, and deep_copy the four output views back into the
/// caller's host arrays.
///
/// \param num_packets_per_lid_d  Per-LID packet counts; must have as many
///   entries as import_lids_d.
/// \param numSameIDs  Forwarded to the counting and unpack steps.
/// \param permute_to_lids_d, permute_from_lids_d  Must have equal sizes.
/// \param TargetNumRows  Number of rows of the target; sizes CRS_rowptr.
/// \param MyTargetPID  Forwarded to the unpack step.
/// \param CRS_rowptr, CRS_colind, CRS_vals  Output arrays; resized here and
///   overwritten with the device results.
/// \param SourcePids  Host array of source process ids.
/// \param TargetPids  Output array; set to TargetNumNonzeros entries of -1
///   before unpacking and overwritten with the device results.
///
/// \throws std::invalid_argument if the two permute LID views differ in
///   size, if the source matrix is not locally indexed, or if
///   import_lids_d and num_packets_per_lid_d differ in size.
1615 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1619 const Kokkos::View<LocalOrdinal
const *,
1620 Kokkos::Device<
typename Node::device_type::execution_space,
1621 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1622 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1626 const Kokkos::View<
const char*,
1627 Kokkos::Device<
typename Node::device_type::execution_space,
1628 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1629 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1633 const Kokkos::View<
const size_t*,
1634 Kokkos::Device<
typename Node::device_type::execution_space,
1635 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1636 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1639 > num_packets_per_lid_d,
1640 const size_t numSameIDs,
1641 const Kokkos::View<LocalOrdinal
const *,
1642 Kokkos::Device<
typename Node::device_type::execution_space,
1643 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1644 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1647 > permute_to_lids_d,
1648 const Kokkos::View<LocalOrdinal
const *,
1649 Kokkos::Device<
typename Node::device_type::execution_space,
1650 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1651 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1654 > permute_from_lids_d,
1655 size_t TargetNumRows,
1656 const int MyTargetPID,
1657 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1658 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1659 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1660 const Teuchos::ArrayView<const int>& SourcePids,
1661 Teuchos::Array<int>& TargetPids)
1663 using execution_space =
typename Node::execution_space;
1669 using Teuchos::ArrayView;
1670 using Teuchos::outArg;
1671 using Teuchos::REDUCE_MAX;
1672 using Teuchos::reduceAll;
1674 typedef typename Node::device_type DT;
1677 typedef typename matrix_type::impl_scalar_type ST;
// Prefix for exception messages and (when timings are enabled) timer names.
// NOTE(review): the prefix ends in _new, while the instantiation macros in
// this file name unpackAndCombineIntoCrsArrays -- confirm this is intended.
1679 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1680 # ifdef HAVE_TPETRA_MMM_TIMINGS
1681 using Teuchos::TimeMonitor;
1682 Teuchos::RCP<TimeMonitor> tm;
1685 using Kokkos::MemoryUnmanaged;
// Input validation: the permute-to and permute-from LID views must have the
// same length.
1687 TEUCHOS_TEST_FOR_EXCEPTION
1688 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1689 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1690 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// The source matrix must already be locally indexed.
1694 TEUCHOS_TEST_FOR_EXCEPTION
1695 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1696 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1697 TEUCHOS_TEST_FOR_EXCEPTION
1698 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1699 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1700 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1705 # ifdef HAVE_TPETRA_MMM_TIMINGS
1706 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
// Number of entries the target CRS arrays must hold; it sizes the column
// index, value, and PID arrays below.
1708 size_t TargetNumNonzeros =
1710 local_matrix, permute_from_lids_d, imports_d,
1711 num_packets_per_lid_d, numSameIDs);
1712 # ifdef HAVE_TPETRA_MMM_TIMINGS
1716 # ifdef HAVE_TPETRA_MMM_TIMINGS
1717 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Row pointer array has one more entry than there are rows.
1719 CRS_rowptr.resize (TargetNumRows+1);
1720 CRS_colind.resize(TargetNumNonzeros);
1721 CRS_vals.resize(TargetNumNonzeros);
// Reinterpret the caller's Scalar array as impl_scalar_type (ST), the type
// used for the device views below.
1722 Teuchos::ArrayRCP<ST>
const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1723 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made at the top of the
// function, and its message text has no space before the != sign.
1727 TEUCHOS_TEST_FOR_EXCEPTION(
1728 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1729 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1730 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// NOTE(review): the resize looks redundant -- the assign that follows
// presumably sets both the length and the contents (every entry becomes -1).
1733 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1734 TargetPids.resize (TargetNumNonzeros);
1736 TargetPids.assign (TargetNumNonzeros, -1);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1739 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1741 # ifdef HAVE_TPETRA_MMM_TIMINGS
1742 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1749 CRS_rowptr.size(),
true,
"crs_rowptr");
1753 CRS_colind.size(),
true,
"crs_colidx");
// Compile-time guard: the values array must not have std::complex<double>
// entries when it is wrapped in a Kokkos::View.
1754 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1755 static_assert (! std::is_same<
1756 typename std::remove_const<
1757 typename std::decay<
1758 decltype (CRS_vals_impl_scalar_type)
1761 std::complex<double> >::value,
1762 "CRS_vals::value_type is std::complex<double>; this should never happen"
1763 ", since std::complex does not work in Kokkos::View objects.");
1764 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1768 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
1770 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1771 static_assert (! std::is_same<
1772 typename decltype (crs_vals_d)::non_const_value_type,
1773 std::complex<double> >::value,
1774 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1775 "never happen, since std::complex does not work in Kokkos::View objects.");
1776 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1780 SourcePids.size(),
true,
"src_pids");
1784 TargetPids.size(),
true,
"tgt_pids");
1786 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of a REDUCE_MAX over the source
// matrix's communicator (presumably of the local bytes_per_value_l), so all
// processes use the same per-value size.
1790 size_t bytes_per_value = 0;
1804 size_t bytes_per_value_l = 0;
// Sample a value from the local matrix if it has any; the other visible
// branch samples crs_vals_d instead.
1805 if (local_matrix.values.extent(0) > 0) {
1806 const ST& val = local_matrix.values(0);
1809 const ST& val = crs_vals_d(0);
1812 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1813 Teuchos::REDUCE_MAX,
1815 outArg(bytes_per_value));
1818 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1819 static_assert (! std::is_same<
1820 typename decltype (crs_vals_d)::non_const_value_type,
1821 std::complex<double> >::value,
1822 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1823 "never happen, since std::complex does not work in Kokkos::View objects.");
1824 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1826 # ifdef HAVE_TPETRA_MMM_TIMINGS
1827 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
// Unpack step: operates on the device views built from the host arrays.
1830 local_matrix, local_col_map, import_lids_d, imports_d,
1831 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1832 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1833 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1835 # ifdef HAVE_TPETRA_MMM_TIMINGS
1840 # ifdef HAVE_TPETRA_MMM_TIMINGS
1841 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Copy results back: each caller-owned host array is wrapped in a
// HostMirror view built from its raw pointer and size, then filled from the
// matching device view with deep_copy.
1843 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1844 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1846 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1848 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1849 CRS_colind.getRawPtr(), CRS_colind.size());
1851 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1853 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1854 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1856 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1858 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1859 TargetPids.getRawPtr(), TargetPids.size());
1861 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
// Instantiation macro variant selected when KOKKOS_ENABLE_DEPRECATED_CODE_4
// is defined (see TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT below).
// For one (ST, LO, GO, NT) combination it names, in order:
//   - Details::unpackCrsMatrixAndCombine
//   - Details::unpackAndCombineWithOwningPIDsCount
//   - Details::unpackCrsMatrixAndCombineNew
//   - Details::unpackAndCombineIntoCrsArrays with Kokkos::View outputs
//   - Details::unpackAndCombineIntoCrsArrays with Teuchos::ArrayRCP /
//     Teuchos::Array outputs
// NOTE(review): in this variant the Kokkos::View parameter types presumably
// carry extra trailing template arguments compared with the _OFF variant --
// confirm against the function definitions.
1869 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT ) \
1871 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1872 const CrsMatrix<ST, LO, GO, NT>&, \
1873 const Teuchos::ArrayView<const char>&, \
1874 const Teuchos::ArrayView<const size_t>&, \
1875 const Teuchos::ArrayView<const LO>&, \
1879 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1880 const CrsMatrix<ST, LO, GO, NT> &, \
1881 const Teuchos::ArrayView<const LO> &, \
1882 const Teuchos::ArrayView<const char> &, \
1883 const Teuchos::ArrayView<const size_t>&, \
1887 const Teuchos::ArrayView<const LO>&, \
1888 const Teuchos::ArrayView<const LO>&); \
1890 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1891 const CrsMatrix<ST, LO, GO, NT>&, \
1892 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1893 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1894 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1896 const CombineMode); \
1898 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1899 const CrsMatrix<ST, LO, GO, NT> &, \
1900 const Kokkos::View<LO const *, \
1901 Kokkos::Device<typename NT::device_type::execution_space, \
1902 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1904 const Kokkos::View<const char*, \
1905 Kokkos::Device<typename NT::device_type::execution_space, \
1906 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1908 const Kokkos::View<const size_t*, \
1909 Kokkos::Device<typename NT::device_type::execution_space, \
1910 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1913 const Kokkos::View<LO const *, \
1914 Kokkos::Device<typename NT::device_type::execution_space, \
1915 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1917 const Kokkos::View<LO const *, \
1918 Kokkos::Device<typename NT::device_type::execution_space, \
1919 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1923 Kokkos::View<size_t*,typename NT::device_type>&, \
1924 Kokkos::View<GO*,typename NT::device_type>&, \
1925 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1926 const Teuchos::ArrayView<const int>&, \
1927 Kokkos::View<int*,typename NT::device_type>&); \
1929 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1930 const CrsMatrix<ST, LO, GO, NT> &, \
1931 const Kokkos::View<LO const *, \
1932 Kokkos::Device<typename NT::device_type::execution_space, \
1933 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1935 const Kokkos::View<const char*, \
1936 Kokkos::Device<typename NT::device_type::execution_space, \
1937 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1939 const Kokkos::View<const size_t*, \
1940 Kokkos::Device<typename NT::device_type::execution_space, \
1941 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1944 const Kokkos::View<LO const *, \
1945 Kokkos::Device<typename NT::device_type::execution_space, \
1946 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1948 const Kokkos::View<LO const *, \
1949 Kokkos::Device<typename NT::device_type::execution_space, \
1950 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1954 Teuchos::ArrayRCP<size_t>&, \
1955 Teuchos::ArrayRCP<GO>&, \
1956 Teuchos::ArrayRCP<ST>&, \
1957 const Teuchos::ArrayView<const int>&, \
1958 Teuchos::Array<int>&);
// Instantiation macro variant selected when KOKKOS_ENABLE_DEPRECATED_CODE_4
// is not defined (see TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT
// below). For one (ST, LO, GO, NT) combination it names, in order:
//   - Details::unpackCrsMatrixAndCombine
//   - Details::unpackAndCombineWithOwningPIDsCount
//   - Details::unpackCrsMatrixAndCombineNew
//   - Details::unpackAndCombineIntoCrsArrays with Kokkos::View outputs
//   - Details::unpackAndCombineIntoCrsArrays with Teuchos::ArrayRCP /
//     Teuchos::Array outputs
// Here every Kokkos::View parameter type is spelled with two template
// arguments only: the value type and a Kokkos::Device.
1960 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT ) \
1962 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1963 const CrsMatrix<ST, LO, GO, NT>&, \
1964 const Teuchos::ArrayView<const char>&, \
1965 const Teuchos::ArrayView<const size_t>&, \
1966 const Teuchos::ArrayView<const LO>&, \
1970 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1971 const CrsMatrix<ST, LO, GO, NT> &, \
1972 const Teuchos::ArrayView<const LO> &, \
1973 const Teuchos::ArrayView<const char> &, \
1974 const Teuchos::ArrayView<const size_t>&, \
1978 const Teuchos::ArrayView<const LO>&, \
1979 const Teuchos::ArrayView<const LO>&); \
1981 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1982 const CrsMatrix<ST, LO, GO, NT>&, \
1983 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1984 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1985 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1987 const CombineMode); \
1989 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1990 const CrsMatrix<ST, LO, GO, NT> &, \
1991 const Kokkos::View<LO const *, \
1992 Kokkos::Device<typename NT::device_type::execution_space, \
1993 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1994 const Kokkos::View<const char*, \
1995 Kokkos::Device<typename NT::device_type::execution_space, \
1996 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1997 const Kokkos::View<const size_t*, \
1998 Kokkos::Device<typename NT::device_type::execution_space, \
1999 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2001 const Kokkos::View<LO const *, \
2002 Kokkos::Device<typename NT::device_type::execution_space, \
2003 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2004 const Kokkos::View<LO const *, \
2005 Kokkos::Device<typename NT::device_type::execution_space, \
2006 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2009 Kokkos::View<size_t*,typename NT::device_type>&, \
2010 Kokkos::View<GO*,typename NT::device_type>&, \
2011 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
2012 const Teuchos::ArrayView<const int>&, \
2013 Kokkos::View<int*,typename NT::device_type>&); \
2015 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
2016 const CrsMatrix<ST, LO, GO, NT> &, \
2017 const Kokkos::View<LO const *, \
2018 Kokkos::Device<typename NT::device_type::execution_space, \
2019 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2020 const Kokkos::View<const char*, \
2021 Kokkos::Device<typename NT::device_type::execution_space, \
2022 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2023 const Kokkos::View<const size_t*, \
2024 Kokkos::Device<typename NT::device_type::execution_space, \
2025 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2027 const Kokkos::View<LO const *, \
2028 Kokkos::Device<typename NT::device_type::execution_space, \
2029 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2030 const Kokkos::View<LO const *, \
2031 Kokkos::Device<typename NT::device_type::execution_space, \
2032 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2035 Teuchos::ArrayRCP<size_t>&, \
2036 Teuchos::ArrayRCP<GO>&, \
2037 Teuchos::ArrayRCP<ST>&, \
2038 const Teuchos::ArrayView<const int>&, \
2039 Teuchos::Array<int>&);
// Public instantiation macro. It forwards its (ST, LO, GO, NT) arguments to
// the _KOKKOS_DEPRECATED_CODE_4_ON variant when KOKKOS_ENABLE_DEPRECATED_CODE_4
// is defined, and to the _KOKKOS_DEPRECATED_CODE_4_OFF variant otherwise.
2041 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
2042 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2043 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT )
2045 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2046 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT )
2049 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...