10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
59 namespace UnpackAndCombineCrsMatrixImpl {
// Unpacks one packed row from the raw byte buffer `imports` into the
// caller-supplied output arrays (gids_out, pids_out, vals_out).
// The *_beg offsets below fix the packed layout of a row as
//   [ entry count | GIDs | PIDs (only if pids_out is non-empty) | values ].
// NOTE(review): the return statement is not visible in this excerpt; a caller
// later in this file adds the result to an error counter, so it is presumably
// an error code -- confirm.
70 template<
class ST,
class LO,
class GO>
72 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
74 const typename PackTraits<ST>::output_array_type& vals_out,
79 const size_t bytes_per_value)
// PIDs are unpacked only when the caller provided storage for them.
85 bool unpack_pids = pids_out.size() > 0;
// Byte offsets of each section, relative to the start of `imports`.
87 const size_t num_ent_beg = offset;
90 const size_t gids_beg = num_ent_beg + num_ent_len;
91 const size_t gids_len =
94 const size_t pids_beg = gids_beg + gids_len;
95 const size_t pids_len = unpack_pids ?
// Values follow the (possibly empty) PID section.
99 const size_t vals_beg = gids_beg + gids_len + pids_len;
100 const size_t vals_len = num_ent * bytes_per_value;
// Raw pointers to the start of each section; pids_in is null when PIDs are
// not being unpacked.
102 const char*
const num_ent_in = imports + num_ent_beg;
103 const char*
const gids_in = imports + gids_beg;
104 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
105 const char*
const vals_in = imports + vals_beg;
// Running total of bytes consumed, checked against the expected total below.
107 size_t num_bytes_out = 0;
// The entry count stored in the buffer must match the count the caller passed.
110 if (static_cast<size_t> (num_ent_out) != num_ent) {
// `p.second` of each unpack step is added to the byte total.
// NOTE(review): the calls that fill `p` are not visible here.
115 Kokkos::pair<int, size_t> p;
120 num_bytes_out += p.second;
127 num_bytes_out += p.second;
134 num_bytes_out += p.second;
// Consistency check: bytes consumed must equal the sum of all section lengths.
137 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138 if (num_bytes_out != expected_num_bytes) {
// Team-parallel functor that unpacks packed rows from `imports` and combines
// them into `local_matrix`. Each league member handles one (row, batch) pair
// read from `batch_info`; the threads of the team split the entries of that
// batch. ADD goes through sumIntoValues and REPLACE through replaceValues.
// Failures are recorded in the single-int view `error_code` (values 21, 22,
// 23, 31 below) using compare-exchange against 0, so an already-recorded
// non-zero code is not overwritten.
154 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
156 typedef LocalMatrix local_matrix_type;
159 typedef typename local_matrix_type::value_type ST;
163 typedef typename DT::execution_space XS;
// View types for the packed input; buffers use BufferDeviceType while
// offsets and the error flag use DT.
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
// Rank-0 view holding a single int error code.
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
// The local ordinal type used here must match the matrix's ordinal type.
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
// Functor state (copied from the constructor arguments below).
178 local_matrix_type local_matrix;
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
// One row per batch: column 0 and column 1 are read in operator() as the
// row index (lid_no) and the batch number within that row.
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
187 size_t bytes_per_value;
189 error_type error_code;
// Constructor: stores each argument in the member of the same name.
192 const local_matrix_type& local_matrix_in,
194 const input_buffer_type& imports_in,
195 const num_packets_per_lid_type& num_packets_per_lid_in,
196 const import_lids_type& import_lids_in,
197 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
198 const offsets_type& offsets_in,
200 const size_t batch_size_in,
201 const size_t bytes_per_value_in,
202 const bool atomic_in) :
203 local_matrix (local_matrix_in),
204 local_col_map (local_col_map_in),
205 imports (imports_in),
206 num_packets_per_lid (num_packets_per_lid_in),
207 import_lids (import_lids_in),
208 batch_info (batch_info_in),
209 offsets (offsets_in),
210 combine_mode (combine_mode_in),
211 batch_size (batch_size_in),
212 bytes_per_value (bytes_per_value_in),
// Per-team entry point: unpack and combine one batch of one imported row.
217 KOKKOS_INLINE_FUNCTION
218 void operator()(member_type team_member)
const
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
// The league rank selects the batch; batch_info maps it to a row index and a
// batch number within that row.
224 const LO batch = team_member.league_rank();
225 const LO lid_no = batch_info(batch, 0);
226 const LO batch_no = batch_info(batch, 1);
// Number of bytes received for this row.
228 const size_t num_bytes = num_packets_per_lid(lid_no);
235 const LO import_lid = import_lids(lid_no);
236 const size_t buf_size = imports.size();
237 const size_t offset = offsets(lid_no);
// Start of this row's packed data inside the import buffer.
241 const char*
const in_buf = imports.data() + offset;
243 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
246 size_t expected_num_bytes = 0;
// Error 21: the row needs more bytes than were received for it.
253 if (expected_num_bytes > num_bytes)
256 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
257 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
258 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
261 Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
// Error 22: the row's byte range [offset, offset + num_bytes) does not fit in
// the import buffer.
265 if (offset > buf_size || offset + num_bytes > buf_size)
268 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
269 "At row %d, the offset (%d) > buffer size (%d)\n",
270 (
int) lid_no, (
int) offset, (
int) buf_size
273 Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
// Entries handled by this batch: the whole row if it fits in one batch, a
// full batch_size if the row extends past this batch, and otherwise what is
// left after the preceding batches (the final assignment below).
278 size_t num_entries_in_batch = 0;
279 if (num_entries_in_row <= batch_size)
280 num_entries_in_batch = num_entries_in_row;
281 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
282 num_entries_in_batch = batch_size;
284 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
// Packed row layout used here: entry count, then GIDs, then values.
287 const size_t num_ent_start = offset;
288 const size_t num_ent_end = num_ent_start + bytes_per_lid;
291 const size_t gids_start = num_ent_end;
292 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
294 const size_t vals_start = gids_end;
// `shift` skips the entries that belong to earlier batches of the same row.
296 const size_t shift = batch_no * batch_size;
297 const char*
const num_ent_in = imports.data() + num_ent_start;
298 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
299 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
// Error 23: the entry count unpacked here disagrees with the count read above.
303 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
306 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
307 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
308 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
311 Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
// Passed to sumIntoValues / replaceValues below; hard-coded to true here.
314 constexpr
bool matrix_has_sorted_rows =
true;
// Threads of the team iterate over the entries of this batch; the functor is
// captured by value (*this).
317 Kokkos::parallel_for(
318 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
319 [=, *
this](
const LO& j)
// Byte distance of entry j within the GID section, then the value section.
324 distance = j * bytes_per_gid;
334 distance = j * bytes_per_value;
// ADD honors the functor's `atomic` flag; REPLACE never uses atomic updates.
337 if (combine_mode ==
ADD) {
341 const bool use_atomic_updates = atomic;
342 (void)local_matrix.sumIntoValues(
347 matrix_has_sorted_rows,
350 }
else if (combine_mode ==
REPLACE) {
354 const bool use_atomic_updates =
false;
355 (void)local_matrix.replaceValues(
360 matrix_has_sorted_rows,
// Error 31: reported with the "unknown error" message below.
366 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
367 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
369 Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
374 team_member.team_barrier();
// Copies the error flag to host memory and returns its value.
380 auto error_code_h = Kokkos::create_mirror_view_and_copy(
381 Kokkos::HostSpace(), error_code
383 return error_code_h();
// Empty tag types; the reduction functor below overloads operator() on them
// to select the maximum-per-row or the total entry count.
388 struct MaxNumEntTag {};
389 struct TotNumEntTag {};
// Reduction functor over packed rows. Under MaxNumEntTag it reduces to the
// largest per-row entry count; under TotNumEntTag it sums the per-row entry
// counts. The per-row count (num_ent_LO) is obtained from the packed data at
// imports.data() + offsets(i).
// NOTE(review): the statements that fill num_ent_LO are not visible here.
399 template<
class LO,
class DT,
class BDT>
402 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
403 typedef Kokkos::View<const size_t*, DT> offsets_type;
404 typedef Kokkos::View<const char*, BDT> input_buffer_type;
// Type of the reduction result.
407 typedef size_t value_type;
410 num_packets_per_lid_type num_packets_per_lid;
411 offsets_type offsets;
412 input_buffer_type imports;
// Constructor: stores the views in the members of the same name.
416 const offsets_type& offsets_in,
417 const input_buffer_type& imports_in) :
418 num_packets_per_lid (num_packets_per_lid_in),
419 offsets (offsets_in),
// Max reduction: keep the larger of the running value and this row's count.
423 KOKKOS_INLINE_FUNCTION
void
424 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
426 const size_t num_bytes = num_packets_per_lid(i);
429 const char*
const in_buf = imports.data () + offsets(i);
431 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
433 update = (update < num_ent) ? num_ent : update;
// Join for the max reduction: dst becomes the larger of dst and src.
437 KOKKOS_INLINE_FUNCTION
void
438 join (
const MaxNumEntTag,
440 const value_type& src)
const
442 if (dst < src) dst = src;
// Sum reduction: add this row's entry count to the running total.
445 KOKKOS_INLINE_FUNCTION
void
446 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
448 const size_t num_bytes = num_packets_per_lid(i);
451 const char*
const in_buf = imports.data () + offsets(i);
453 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
// Runs a MaxNumEntTag parallel_reduce over every row described by
// num_packets_per_lid and reduces into max_num_ent.
// NOTE(review): the functor construction and the return statement are not
// visible here; presumably max_num_ent is returned -- confirm.
465 template<
class LO,
class DT,
class BDT>
467 compute_maximum_num_entries (
468 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
469 const Kokkos::View<const size_t*, DT>& offsets,
470 const Kokkos::View<const char*, BDT>& imports)
472 typedef typename DT::execution_space XS;
473 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
474 MaxNumEntTag> range_policy;
// One reduction iteration per entry of num_packets_per_lid.
478 const LO numRowsToUnpack =
479 static_cast<LO
> (num_packets_per_lid.extent (0));
480 size_t max_num_ent = 0;
481 Kokkos::parallel_reduce (
"Max num entries in CRS",
482 range_policy (0, numRowsToUnpack),
483 functor, max_num_ent);
// Runs a TotNumEntTag parallel_reduce with NumEntriesFunctor over every row
// described by num_packets_per_lid and reduces into tot_num_ent.
// NOTE(review): the return statement is not visible here; presumably
// tot_num_ent is returned -- confirm.
494 template<
class LO,
class DT,
class BDT>
496 compute_total_num_entries (
497 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
498 const Kokkos::View<const size_t*, DT>& offsets,
499 const Kokkos::View<const char*, BDT>& imports)
501 typedef typename DT::execution_space XS;
502 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
503 size_t tot_num_ent = 0;
504 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
// One reduction iteration per entry of num_packets_per_lid.
506 const LO numRowsToUnpack =
507 static_cast<LO
> (num_packets_per_lid.extent (0));
508 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
509 range_policy (0, numRowsToUnpack),
510 functor, tot_num_ent);
// Reads the entry count of one packed row, stored at imports + offset, and
// returns it as size_t. Returns OrdinalTraits<size_t>::invalid() when the
// bytes needed for the count exceed num_bytes.
515 KOKKOS_INLINE_FUNCTION
517 unpackRowCount(
const char imports[],
519 const size_t num_bytes)
521 using PT = PackTraits<LO>;
// Bytes needed to hold a packed LO; the row must supply at least this many.
525 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
526 if (p_num_bytes > num_bytes) {
527 return OrdinalTraits<size_t>::invalid();
529 const char*
const in_buf = imports + offset;
// The result of unpackValue is deliberately discarded.
530 (void) PT::unpackValue(num_ent_LO, in_buf);
532 return static_cast<size_t>(num_ent_LO);
// Fills batch_info with one (row index, batch number) pair for every batch of
// every row, using the per-row batch counts in batches_per_lid. Returns
// whether the final value of `batch` equals batch_info.extent(0).
// NOTE(review): the declaration and increment of `batch` are not visible
// here; presumably it advances once per pair written -- confirm.
539 template<
class View1,
class View2>
543 const View1& batches_per_lid,
547 using LO =
typename View2::value_type;
// Outer loop over rows, inner loop over the batches of that row.
549 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
551 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
553 batch_info(batch, 0) =
static_cast<LO
>(i);
554 batch_info(batch, 1) = batch_no;
558 return batch == batch_info.extent(0);
// Host-side driver that unpacks `imports` and combines the rows into
// `local_matrix`. Visible steps: validate the combine mode and input sizes,
// find the largest row, split rows into batches of at most batch_size
// entries, build the (row, batch) table on the host, launch
// UnpackCrsMatrixAndCombineFunctor with one team per batch, and throw if the
// functor reports an error.
568 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
570 unpackAndCombineIntoCrsMatrix(
571 const LocalMatrix& local_matrix,
572 const LocalMap& local_map,
573 const Kokkos::View<const char*, BufferDeviceType>& imports,
574 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
575 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
578 using ST =
typename LocalMatrix::value_type;
581 using XS =
typename DT::execution_space;
// Prefix for every exception message thrown from this function.
582 const char prefix[] =
583 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
584 "unpackAndCombineIntoCrsMatrix: ";
// NOTE(review): the body of this branch is not visible; presumably an early
// return when there is nothing to unpack -- confirm.
586 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
587 if (num_import_lids == 0) {
// ABSMAX and INSERT are rejected with invalid_argument.
594 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
595 std::invalid_argument,
596 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
597 "static graph (i.e., was constructed with the CrsMatrix constructor "
598 "that takes a const CrsGraph pointer).");
600 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
601 std::invalid_argument,
602 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
603 "(i.e., was constructed with the CrsMatrix constructor that takes a "
604 "const CrsGraph pointer).");
// Only ADD and REPLACE pass beyond this point.
607 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
608 std::invalid_argument,
609 prefix <<
"Invalid combine mode; should never get "
610 "here! Please report this bug to the Tpetra developers.");
// import_lids and num_packets_per_lid must have the same length.
613 bool bad_num_import_lids =
614 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
615 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
616 std::invalid_argument,
617 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
618 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
// One offset per row plus one extra slot.
// NOTE(review): the code that fills `offsets` is not visible here.
622 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// The batch size never exceeds the longest row.
626 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
628 const size_t batch_size = std::min(default_batch_size, max_num_ent);
// batch_info starts with zero rows and is resized once num_batches is known.
631 size_t num_batches = 0;
632 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
633 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
// Count batches per row and in total. Rows longer than batch_size get
// ceil(num_entries_in_row / batch_size) batches.
635 Kokkos::parallel_reduce(
636 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
637 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
639 const size_t num_entries_in_row = unpackRowCount<LO>(
640 imports.data(), offsets(i), num_packets_per_lid(i)
643 (num_entries_in_row <= batch_size) ?
645 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
646 batches += batches_per_lid(i);
650 Kokkos::resize(batch_info, num_batches);
// The (row, batch) table is built on host mirrors; the return value of
// compute_batch_info is discarded.
// NOTE(review): the deep copies between host and device are not visible here.
652 Kokkos::HostSpace host_space;
653 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
657 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
659 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
// Atomic updates are requested whenever the execution space reports a
// concurrency other than 1.
668 const bool atomic = XS().concurrency() != 1;
669 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
// One team per batch. Kokkos::AUTO picks the team size unless this is a GPU
// execution space with a valid team_size, in which case team_size is used.
684 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
686 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
688 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
692 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
// Surface any error code recorded by the functor as an exception.
695 auto error_code = f.error();
696 TEUCHOS_TEST_FOR_EXCEPTION(
699 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
// Counts matrix entries from three sources: the first num_same_ids rows of
// local_matrix, the rows of local_matrix named by permute_from_lids, and the
// rows packed in `imports`.
// NOTE(review): the function name, the accumulators, and the return statement
// are not visible here; presumably the three counts are summed and returned
// -- confirm.
703 template<
class LocalMatrix,
class BufferDeviceType>
706 const LocalMatrix& local_matrix,
707 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
// The two parameter spellings below are alternatives selected by the
// preprocessor.
708 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
709 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
710 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
712 const Kokkos::View<const char*, BufferDeviceType>& imports,
713 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
715 const size_t num_same_ids)
717 using Kokkos::parallel_reduce;
718 typedef typename LocalMatrix::ordinal_type LO;
719 typedef typename LocalMatrix::device_type device_type;
720 typedef typename device_type::execution_space XS;
721 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
722 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
723 typedef BufferDeviceType BDT;
// Part 1: sum the lengths (row_map[lid+1] - row_map[lid]) of rows
// 0 .. num_same_ids-1.
729 num_items =
static_cast<LO
>(num_same_ids);
732 parallel_reduce(range_policy(0, num_items),
733 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
734 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
735 -local_matrix.graph.row_map[lid]);
// Part 2: sum the lengths of the rows listed in permute_from_lids.
741 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
744 parallel_reduce(range_policy(0, num_items),
745 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
746 const LO lid = permute_from_lids(i);
747 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
748 - local_matrix.graph.row_map[lid]);
// Part 3: count the entries packed in `imports`; offsets has one slot per
// packed row plus one.
755 const size_type np = num_packets_per_lid.extent(0);
756 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
759 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
// For every imported row i, reads the row's entry count from the packed
// buffer and atomically adds it to tgt_rowptr(import_lids(i)). The reduction
// variable k_error is an int error accumulator.
// NOTE(review): the body of the InvalidNum branch and the return statement
// are not visible here.
767 template<
class LO,
class DT,
class BDT>
769 setupRowPointersForRemotes(
770 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
772 const Kokkos::View<const char*, BDT>& imports,
773 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
774 const typename PackTraits<size_t>::input_array_type& offsets)
776 using Kokkos::parallel_reduce;
777 typedef typename DT::execution_space XS;
778 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
779 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Sentinel that unpackRowCount returns when a row is too short to hold its
// entry count.
781 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
782 const size_type N = num_packets_per_lid.extent(0);
785 parallel_reduce (
"Setup row pointers for remotes",
787 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// The increment is cast to the element type of tgt_rowptr for the atomic add.
788 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
789 const size_t num_bytes = num_packets_per_lid(i);
790 const size_t offset = offsets(i);
791 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
792 if (num_ent == InvalidNum) {
// The add is atomic, so concurrent iterations may update the same target
// row slot.
795 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
// Runs a Kokkos parallel_scan over the N = new_start_row.extent(0) entries:
// tgt_rowptr(i) is overwritten with the running scan value `update`, and that
// same value is copied into new_start_row(i).
// NOTE(review): the lines that add cur_val to `update` and test `final` are
// not visible here; presumably this converts per-row lengths into row start
// offsets -- confirm.
803 makeCrsRowPtrFromLengths(
804 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
805 const Kokkos::View<size_t*,DT>& new_start_row)
807 using Kokkos::parallel_scan;
808 typedef typename DT::execution_space XS;
809 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
810 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
811 const size_type N = new_start_row.extent(0);
812 parallel_scan(range_policy(0, N),
813 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Save the current entry before it is overwritten with the scan value.
814 auto cur_val = tgt_rowptr(i);
816 tgt_rowptr(i) = update;
817 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of local_matrix into the target CRS
// arrays. Source and target row indices are identical (both are i). For each
// entry it writes the value, the global column index from local_col_map, and
// the owning PID, with -1 substituted when the PID equals my_pid.
// new_start_row(tgt_lid) is advanced by the number of entries in the row.
824 template<
class LocalMatrix,
class LocalMap>
827 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
829 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
830 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
831 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
833 const LocalMatrix& local_matrix,
834 const LocalMap& local_col_map,
835 const size_t num_same_ids,
838 using Kokkos::parallel_for;
841 typedef typename DT::execution_space XS;
842 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
844 parallel_for(range_policy(0, num_same_ids),
845 KOKKOS_LAMBDA(
const size_t i) {
846 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// First entry of the source row in the source matrix.
848 const LO src_lid =
static_cast<LO
>(i);
849 size_t src_row = local_matrix.graph.row_map(src_lid);
// First entry of the target row in the target arrays.
851 const LO tgt_lid =
static_cast<LO
>(i);
852 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Reserve this row's entries in new_start_row before copying them.
854 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
855 - local_matrix.graph.row_map(src_lid);
856 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Copy entry j of the source row to position tgt_row + (j - src_row).
858 for (
size_t j=local_matrix.graph.row_map(src_lid);
859 j<local_matrix.graph.row_map(src_lid+1); ++j) {
860 LO src_col = local_matrix.graph.entries(j);
861 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
862 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
863 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows of local_matrix into the target CRS arrays: row
// permute_from_lids(i) of the source goes to row permute_to_lids(i) of the
// target. For each entry it writes the value, the global column index from
// local_col_map, and the owning PID, with -1 substituted when the PID equals
// my_pid. new_start_row(tgt_lid) is advanced by the number of entries copied.
869 template<
class LocalMatrix,
class LocalMap>
871 copyDataFromPermuteIDs(
872 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
874 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
875 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
876 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
878 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
879 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
880 const LocalMatrix& local_matrix,
881 const LocalMap& local_col_map,
884 using Kokkos::parallel_for;
887 typedef typename DT::execution_space XS;
888 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
889 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// One iteration per (to, from) pair; the count is taken from permute_to_lids.
891 const size_type num_permute_to_lids = permute_to_lids.extent(0);
893 parallel_for(range_policy(0, num_permute_to_lids),
894 KOKKOS_LAMBDA(
const size_t i) {
895 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// First entry of the source row in the source matrix.
897 const LO src_lid = permute_from_lids(i);
898 const size_t src_row = local_matrix.graph.row_map(src_lid);
// First entry of the target row in the target arrays.
900 const LO tgt_lid = permute_to_lids(i);
901 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Reserve this row's entries in new_start_row before copying them.
903 size_t nsr = local_matrix.graph.row_map(src_lid+1)
904 - local_matrix.graph.row_map(src_lid);
905 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Copy entry j of the source row to position tgt_row + (j - src_row).
907 for (
size_t j=local_matrix.graph.row_map(src_lid);
908 j<local_matrix.graph.row_map(src_lid+1); ++j) {
909 LO src_col = local_matrix.graph.entries(j);
910 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
911 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
912 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks every imported row into the target CRS arrays. For row i it reads
// the entry count, reserves that many slots by atomically advancing
// new_start_row(import_lids(i)), unpacks GIDs/PIDs/values into subviews
// covering those slots, and then rewrites every PID equal to my_pid as -1.
// Errors are accumulated in the int reduction variable k_error.
// NOTE(review): the bodies of the two early branches and the return statement
// are not visible here.
918 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
920 unpackAndCombineIntoCrsArrays2(
921 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
923 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
924 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
925 const typename PackTraits<size_t>::input_array_type& offsets,
926 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
// The two parameter spellings below are alternatives selected by the
// preprocessor.
927 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
928 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
929 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
931 const Kokkos::View<const char*, BufferDeviceType>& imports,
932 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
937 const size_t bytes_per_value)
940 using Kokkos::subview;
941 using Kokkos::MemoryUnmanaged;
942 using Kokkos::parallel_reduce;
943 using Kokkos::atomic_fetch_add;
948 typedef typename LocalMatrix::value_type ST;
949 typedef typename DT::execution_space XS;
950 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
951 typedef typename Kokkos::pair<size_type, size_type> slice;
952 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Unmanaged view types for the per-row output slices.
954 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
955 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
956 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Sentinel that unpackRowCount returns when a row is too short to hold its
// entry count.
958 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
961 const size_type num_import_lids = import_lids.size();
964 parallel_reduce (
"Unpack and combine into CRS",
965 range_policy (0, num_import_lids),
966 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
967 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
968 const size_t num_bytes = num_packets_per_lid(i);
969 const size_t offset = offsets(i);
// Rows with no bytes take a separate branch.
970 if (num_bytes == 0) {
974 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
975 if (num_ent == InvalidNum) {
// Reserve [start_row, end_row) in the target arrays; the fetch-add returns
// the value of new_start_row(lcl_row) from before the increment.
979 const LO lcl_row = import_lids(i);
980 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
981 const size_t end_row = start_row + num_ent;
// Output slices covering exactly the reserved range.
983 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
984 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
985 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
// unpackRow's result is added to the error accumulator.
987 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
988 imports.data(), offset, num_bytes,
989 num_ent, bytes_per_value);
// Replace PIDs equal to my_pid with -1.
992 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
993 const int pid = pids_out(j);
994 pids_out(j) = (pid != my_pid) ? pid : -1;
// Builds the target CRS arrays (tgt_rowptr, tgt_colind, tgt_vals, tgt_pids)
// from three sources: rows shared between source and target (same IDs),
// permuted rows, and rows packed in `imports`. Visible steps:
//   1. store per-row lengths in tgt_rowptr for same-ID and permuted rows;
//   2. add the lengths of imported rows (setupRowPointersForRemotes);
//   3. turn lengths into row starts (makeCrsRowPtrFromLengths);
//   4. copy same-ID and permuted rows, then unpack the imported rows.
// Throws std::logic_error when any helper reports a non-zero error.
1001 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1004 const LocalMatrix & local_matrix,
1005 const LocalMap & local_col_map,
1006 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
// The two parameter spellings below are alternatives selected by the
// preprocessor.
1007 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1008 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1009 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1011 const Kokkos::View<const char*, BufferDeviceType>& imports,
1012 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1014 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1015 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1016 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1017 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1018 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1021 const size_t num_same_ids,
1022 const size_t tgt_num_rows,
1023 const size_t tgt_num_nonzeros,
1024 const int my_tgt_pid,
1025 const size_t bytes_per_value)
1028 using Kokkos::subview;
1029 using Kokkos::parallel_for;
1030 using Kokkos::MemoryUnmanaged;
1034 typedef typename DT::execution_space XS;
1035 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1036 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1037 typedef BufferDeviceType BDT;
// Prefix for every exception message thrown from this function.
1039 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1041 const size_t N = tgt_num_rows;
1045 const int my_pid = my_tgt_pid;
// Loop over the N+1 row-pointer slots.
// NOTE(review): the loop body is not visible; presumably it zeroes
// tgt_rowptr -- confirm.
1048 parallel_for(range_policy(0, N+1),
1049 KOKKOS_LAMBDA(
const size_t i) {
// Same-ID rows: target row i gets the length of source row i.
1055 parallel_for(range_policy(0, num_same_ids),
1056 KOKKOS_LAMBDA(
const size_t i) {
1057 const LO tgt_lid =
static_cast<LO
>(i);
1058 const LO src_lid =
static_cast<LO
>(i);
1059 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1060 - local_matrix.graph.row_map(src_lid);
// Permuted rows: target row permute_to_lids(i) gets the length of source row
// permute_from_lids(i).
1065 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1066 parallel_for(range_policy(0, num_permute_to_lids),
1067 KOKKOS_LAMBDA(
const size_t i) {
1068 const LO tgt_lid = permute_to_lids(i);
1069 const LO src_lid = permute_from_lids(i);
1070 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1071 - local_matrix.graph.row_map(src_lid);
// Byte offsets of each packed row in `imports`: one per row plus one extra.
// NOTE(review): the code that fills `offsets` is not visible here.
1076 const size_type num_import_lids = import_lids.extent(0);
1077 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// Debug builds only: the last offset must equal the size of `imports`.
1080 #ifdef HAVE_TPETRA_DEBUG
1082 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1083 const bool condition =
1084 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1085 TEUCHOS_TEST_FOR_EXCEPTION
1086 (condition, std::logic_error, prefix
1087 <<
"The final offset in bytes " << nth_offset_h
1088 <<
" != imports.size() = " << imports.extent(0)
1089 <<
". Please report this bug to the Tpetra developers.");
1091 #endif // HAVE_TPETRA_DEBUG
// Add the lengths of the imported rows to tgt_rowptr.
1095 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1096 import_lids, imports, num_packets_per_lid, offsets);
1097 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1098 <<
" Error transferring data to target row pointers. "
1099 "Please report this bug to the Tpetra developers.");
// new_start_row tracks the next free slot of each target row while the data
// is copied in.
1103 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1106 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Copy the rows that come directly from local_matrix.
1109 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1110 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1112 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1113 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1114 local_matrix, local_col_map, my_pid);
// NOTE(review): the body of this branch is not visible; presumably an early
// exit when the import buffer is empty -- confirm.
1116 if (imports.extent(0) <= 0) {
// Unpack the imported rows; any non-zero result is reported as a logic error.
1120 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1121 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1122 local_matrix, local_col_map, my_pid, bytes_per_value);
1123 TEUCHOS_TEST_FOR_EXCEPTION(
1124 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1125 "should never happen. Please report this bug to the Tpetra developers.");
// Wrapper taking Teuchos::ArrayView inputs: it prepares the views
// num_packets_per_lid_d, import_lids_d and imports_d from the three host
// arrays, obtains the column map's local map, and forwards everything to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
// NOTE(review): the function name and the calls that create the *_d views are
// not visible here; presumably they copy the host data to the device
// -- confirm.
1171 template<
typename ST,
typename LO,
typename GO,
typename Node>
1175 const Teuchos::ArrayView<const char>& imports,
1176 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1177 const Teuchos::ArrayView<const LO>& importLIDs,
1182 typedef typename Node::device_type device_type;
// The node's device type must match the local matrix's device type.
1184 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1185 "Node::device_type and LocalMatrix::device_type must be the same.");
1188 device_type outputDevice;
// Views built from the caller's arrays, labelled by the trailing strings.
1193 auto num_packets_per_lid_d =
1195 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1197 auto import_lids_d =
1199 importLIDs.size(),
true,
"import_lids");
1203 imports.size(),
true,
"imports");
1206 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// Delegate the actual unpack-and-combine to the implementation namespace.
1217 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1218 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1219 import_lids_d, combineMode);
// Wrapper taking Kokkos::DualView inputs: it syncs numPacketsPerLID and
// imports to the device when needed, asserts that importLIDs does not need a
// device sync, and forwards the device views to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1223 template<
typename ST,
typename LO,
typename GO,
typename NT>
1225 unpackCrsMatrixAndCombineNew(
1227 Kokkos::DualView<
char*,
1229 Kokkos::DualView<
size_t*,
1231 const Kokkos::DualView<
const LO*,
1239 using device_type =
typename crs_matrix_type::device_type;
1240 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1241 using buffer_device_type =
typename dist_object_type::buffer_device_type;
// The matrix's device type must match the local matrix's device type.
1244 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1245 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1246 "must be the same.");
// Sync the packet counts to the device only if the device copy is stale.
1248 if (numPacketsPerLID.need_sync_device()) {
1249 numPacketsPerLID.sync_device ();
1251 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
// importLIDs is const and is not synced here, so its device copy must
// already be current.
1253 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1254 auto import_lids_d = importLIDs.view_device ();
// Sync the packed data to the device only if the device copy is stale.
1256 if (imports.need_sync_device()) {
1257 imports.sync_device ();
1259 auto imports_d = imports.view_device ();
1262 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1263 typedef decltype (local_col_map) local_map_type;
// Delegate the actual unpack-and-combine to the implementation namespace.
1265 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1266 local_matrix_device_type,
1269 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1270 import_lids_d, combineMode);
// Public wrapper taking Teuchos::ArrayView inputs. It checks that the permute
// lists have equal length, that sourceMatrix is locally indexed, and that
// importLIDs and numPacketsPerLID have equal length. It then prepares the
// views permute_from_lids_d, imports_d and num_packets_per_lid_d and forwards
// them, with the local matrix and numSameIDs, to the counting implementation.
// NOTE(review): the function name, the view-creation calls, and the
// return/call line are not visible here.
1328 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1331 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1332 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1333 const Teuchos::ArrayView<const
char> &imports,
1334 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1338 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1339 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1341 using Kokkos::MemoryUnmanaged;
1343 typedef typename Node::device_type DT;
// Prefix for every exception message thrown from this function.
1344 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The permute "to" and "from" lists must pair up one-to-one.
1346 TEUCHOS_TEST_FOR_EXCEPTION
1347 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1348 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1349 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
// The source matrix must be locally indexed.
1352 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1353 TEUCHOS_TEST_FOR_EXCEPTION
1354 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1355 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1356 TEUCHOS_TEST_FOR_EXCEPTION
1357 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1358 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1359 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1361 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
// Device type pairing the node's execution space with the communication
// buffer memory space.
1363 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1364 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
// Each view below has two alternative declarations selected by the
// preprocessor.
1366 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1367 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1369 Kokkos::View<LocalOrdinal const *, kokkos_device_type> permute_from_lids_d =
1372 permuteFromLIDs.getRawPtr (),
1373 permuteFromLIDs.size (),
true,
1374 "permute_from_lids");
1376 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1377 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1379 Kokkos::View<const char*, kokkos_device_type> imports_d =
1382 imports.getRawPtr (),
1383 imports.size (),
true,
1386 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1387 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1389 Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1392 numPacketsPerLID.getRawPtr (),
1393 numPacketsPerLID.size (),
true,
1394 "num_packets_per_lid");
// Arguments forwarded to the counting implementation.
1397 local_matrix, permute_from_lids_d, imports_d,
1398 num_packets_per_lid_d, numSameIDs);
// Overload that delivers its results in Kokkos::View outputs taken by
// non-const reference (crs_rowptr_d, crs_colind_d, TargetPids).
//
// NOTE(review): this listing has gaps -- the function name, return type, the
// leading parameters, and several callee names are not visible here.  The
// error-message prefix below and the explicit-instantiation macros later in
// the file suggest this is Tpetra::Details::unpackAndCombineIntoCrsArrays;
// confirm against the complete file.
//
// Steps visible in the body, in order:
//   1. Throw std::invalid_argument if the permute lists, or the import-LID and
//      packets-per-LID views, differ in size, or if the source matrix is not
//      locally indexed.
//   2. Compute TargetNumNonzeros (callee not visible; the timer label is
//      "unpackAndCombineWithOwningPIDsCount").
//   3. Resize crs_rowptr_d to TargetNumRows+1 and crs_colind_d / crs_vals_d to
//      TargetNumNonzeros; resize TargetPids if its size differs.
//   4. Reduce bytes_per_value with REDUCE_MAX over the source matrix's
//      communicator.
//   5. Unpack into the CRS arrays (callee not visible), then overwrite every
//      TargetPids entry that is still -1 with MyTargetPID.
1416 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1420 const Kokkos::View<LocalOrdinal
const *,
1421 Kokkos::Device<
typename Node::device_type::execution_space,
1422 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1423 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1427 const Kokkos::View<
const char*,
1428 Kokkos::Device<
typename Node::device_type::execution_space,
1429 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1430 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1434 const Kokkos::View<
const size_t*,
1435 Kokkos::Device<
typename Node::device_type::execution_space,
1436 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1437 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1440 > num_packets_per_lid_d,
1441 const size_t numSameIDs,
1442 const Kokkos::View<LocalOrdinal
const *,
1443 Kokkos::Device<
typename Node::device_type::execution_space,
1444 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1445 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1448 > permute_to_lids_d,
1449 const Kokkos::View<LocalOrdinal
const *,
1450 Kokkos::Device<
typename Node::device_type::execution_space,
1451 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1452 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1455 > permute_from_lids_d,
1456 size_t TargetNumRows,
1457 const int MyTargetPID,
// crs_rowptr_d and crs_colind_d are outputs: the body resizes them.
1458 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1459 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1461 const Teuchos::ArrayView<const int>& SourcePids,
// TargetPids is also an output; it is resized to TargetNumNonzeros if needed.
1462 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1464 using execution_space =
typename Node::execution_space;
1470 using Teuchos::ArrayView;
1471 using Teuchos::outArg;
1472 using Teuchos::REDUCE_MAX;
1473 using Teuchos::reduceAll;
1475 typedef typename Node::device_type DT;
1478 typedef typename matrix_type::impl_scalar_type ST;
// Prefix prepended to every exception message and timer label below.
1480 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1481 # ifdef HAVE_TPETRA_MMM_TIMINGS
1482 using Teuchos::TimeMonitor;
1483 Teuchos::RCP<TimeMonitor> tm;
1486 using Kokkos::MemoryUnmanaged;
// Reject mismatched permutation lists up front.
1488 TEUCHOS_TEST_FOR_EXCEPTION
1489 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1490 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1491 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// The source matrix must be locally indexed.
// NOTE(review): the line that defines `locallyIndexed` is not visible here.
1495 TEUCHOS_TEST_FOR_EXCEPTION
1496 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1497 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1498 TEUCHOS_TEST_FOR_EXCEPTION
1499 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1500 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1501 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1506 # ifdef HAVE_TPETRA_MMM_TIMINGS
1507 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
// Number of entries the output CRS arrays must hold.
// NOTE(review): the callee name is missing from this listing; presumably
// unpackAndCombineWithOwningPIDsCount, matching the timer label -- confirm.
1509 size_t TargetNumNonzeros =
1511 local_matrix, permute_from_lids_d, imports_d,
1512 num_packets_per_lid_d, numSameIDs);
1513 # ifdef HAVE_TPETRA_MMM_TIMINGS
1517 # ifdef HAVE_TPETRA_MMM_TIMINGS
1518 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Size the outputs: one row offset per target row plus one, and one column
// index / value per target nonzero.
1520 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1521 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1522 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1523 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made near the top of the
// function; the message here names "permuteToLIDs" instead of
// "permute_to_lids_d" and has no space before "!=".
1527 TEUCHOS_TEST_FOR_EXCEPTION(
1528 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1529 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1530 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// Only resize TargetPids when its current size differs from the nonzero count.
1532 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1533 Kokkos::resize(TargetPids,TargetNumNonzeros);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1538 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1540 # ifdef HAVE_TPETRA_MMM_TIMINGS
1541 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
// NOTE(review): the start of this call is missing; the trailing arguments
// (size, copy=true, label "src_pids") look like
// create_mirror_view_from_raw_host_array applied to SourcePids -- confirm.
1548 SourcePids.size(),
true,
"src_pids");
1550 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of the REDUCE_MAX reduction below;
// bytes_per_value_l is presumably this process's local contribution (the
// lines that assign it and pass it to reduceAll are not visible -- confirm).
1554 size_t bytes_per_value = 0;
1568 size_t bytes_per_value_l = 0;
// Take a sample value: from the source matrix if it has any values, otherwise
// (in a branch whose condition is not visible here) from crs_vals_d.
1569 if (local_matrix.values.extent(0) > 0) {
1570 const ST& val = local_matrix.values(0);
1573 const ST& val = crs_vals_d(0);
1576 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1577 Teuchos::REDUCE_MAX,
1579 outArg(bytes_per_value));
1582 # ifdef HAVE_TPETRA_MMM_TIMINGS
1583 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
// Arguments of the main unpack call.
// NOTE(review): the callee name and the final argument(s) are missing from
// this listing.
1586 local_matrix, local_col_map, import_lids_d, imports_d,
1587 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1589 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1591 # ifdef HAVE_TPETRA_MMM_TIMINGS
1596 # ifdef HAVE_TPETRA_MMM_TIMINGS
1597 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Post-pass over TargetPids: any entry still holding -1 is set to MyTargetPID.
1600 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (
const size_t i) {
1601 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
// Overload that delivers its results in host-side Teuchos containers
// (CRS_rowptr, CRS_colind, CRS_vals as ArrayRCP; TargetPids as Array), all
// taken by non-const reference.
//
// NOTE(review): this listing has gaps -- the function name, return type, the
// leading parameters, and several callee names are not visible here.  The
// error-message prefix below and the explicit-instantiation macros later in
// the file suggest this is Tpetra::Details::unpackAndCombineIntoCrsArrays;
// confirm against the complete file.
//
// Steps visible in the body, in order:
//   1. Throw std::invalid_argument if the permute lists, or the import-LID and
//      packets-per-LID views, differ in size, or if the source matrix is not
//      locally indexed.
//   2. Compute TargetNumNonzeros (callee not visible; the timer label is
//      "unpackAndCombineWithOwningPIDsCount").
//   3. Resize CRS_rowptr to TargetNumRows+1 and CRS_colind / CRS_vals to
//      TargetNumNonzeros; size TargetPids to TargetNumNonzeros and set every
//      entry to -1.
//   4. Build views labelled "crs_rowptr", "crs_colidx", "crs_vals",
//      "src_pids", "tgt_pids" from the host arrays (callee not visible).
//   5. Reduce bytes_per_value with REDUCE_MAX over the source matrix's
//      communicator.
//   6. Unpack into the views (callee not visible), then deep_copy each result
//      view back into the caller's host arrays.
1606 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1610 const Kokkos::View<LocalOrdinal
const *,
1611 Kokkos::Device<
typename Node::device_type::execution_space,
1612 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1613 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1617 const Kokkos::View<
const char*,
1618 Kokkos::Device<
typename Node::device_type::execution_space,
1619 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1620 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1624 const Kokkos::View<
const size_t*,
1625 Kokkos::Device<
typename Node::device_type::execution_space,
1626 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1627 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1630 > num_packets_per_lid_d,
1631 const size_t numSameIDs,
1632 const Kokkos::View<LocalOrdinal
const *,
1633 Kokkos::Device<
typename Node::device_type::execution_space,
1634 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1635 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1638 > permute_to_lids_d,
1639 const Kokkos::View<LocalOrdinal
const *,
1640 Kokkos::Device<
typename Node::device_type::execution_space,
1641 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1642 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1645 > permute_from_lids_d,
1646 size_t TargetNumRows,
1647 const int MyTargetPID,
// Host-side outputs: the body resizes all three arrays and later copies the
// unpacked results into them.
1648 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1649 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1650 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1651 const Teuchos::ArrayView<const int>& SourcePids,
// TargetPids is also an output; it is sized to TargetNumNonzeros in the body.
1652 Teuchos::Array<int>& TargetPids)
1654 using execution_space =
typename Node::execution_space;
1660 using Teuchos::ArrayView;
1661 using Teuchos::outArg;
1662 using Teuchos::REDUCE_MAX;
1663 using Teuchos::reduceAll;
1665 typedef typename Node::device_type DT;
1668 typedef typename matrix_type::impl_scalar_type ST;
// Prefix prepended to every exception message and timer label below.
1670 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1671 # ifdef HAVE_TPETRA_MMM_TIMINGS
1672 using Teuchos::TimeMonitor;
1673 Teuchos::RCP<TimeMonitor> tm;
1676 using Kokkos::MemoryUnmanaged;
// Reject mismatched permutation lists up front.
1678 TEUCHOS_TEST_FOR_EXCEPTION
1679 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1680 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1681 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// The source matrix must be locally indexed.
// NOTE(review): the line that defines `locallyIndexed` is not visible here.
1685 TEUCHOS_TEST_FOR_EXCEPTION
1686 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1687 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1688 TEUCHOS_TEST_FOR_EXCEPTION
1689 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1690 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1691 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1696 # ifdef HAVE_TPETRA_MMM_TIMINGS
1697 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
// Number of entries the output CRS arrays must hold.
// NOTE(review): the callee name is missing from this listing; presumably
// unpackAndCombineWithOwningPIDsCount, matching the timer label -- confirm.
1699 size_t TargetNumNonzeros =
1701 local_matrix, permute_from_lids_d, imports_d,
1702 num_packets_per_lid_d, numSameIDs);
1703 # ifdef HAVE_TPETRA_MMM_TIMINGS
1707 # ifdef HAVE_TPETRA_MMM_TIMINGS
1708 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Size the host outputs: one row offset per target row plus one, and one
// column index / value per target nonzero.
1710 CRS_rowptr.resize (TargetNumRows+1);
1711 CRS_colind.resize(TargetNumNonzeros);
1712 CRS_vals.resize(TargetNumNonzeros);
// Reinterpret the Scalar array as impl_scalar_type (ST); this alias is what
// gets wrapped in views and copied into further down.
1713 Teuchos::ArrayRCP<ST>
const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1714 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made near the top of the
// function; the message here names "permuteToLIDs" instead of
// "permute_to_lids_d" and has no space before "!=".
1718 TEUCHOS_TEST_FOR_EXCEPTION(
1719 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1720 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1721 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// Make TargetPids hold exactly TargetNumNonzeros entries, then set every
// entry to -1.
1724 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1725 TargetPids.resize (TargetNumNonzeros);
1727 TargetPids.assign (TargetNumNonzeros, -1);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1730 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1732 # ifdef HAVE_TPETRA_MMM_TIMINGS
1733 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
// NOTE(review): for each view built below, the start of the call is missing;
// the trailing arguments (size, copy=true, label) look like
// create_mirror_view_from_raw_host_array -- confirm.
1740 CRS_rowptr.size(),
true,
"crs_rowptr");
1744 CRS_colind.size(),
true,
"crs_colidx");
// Compile-time guard (only when HAVE_TPETRA_INST_COMPLEX_DOUBLE is defined):
// the impl-scalar array type must not be std::complex<double>.
1745 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1746 static_assert (! std::is_same<
1747 typename std::remove_const<
1748 typename std::decay<
1749 decltype (CRS_vals_impl_scalar_type)
1752 std::complex<double> >::value,
1753 "CRS_vals::value_type is std::complex<double>; this should never happen"
1754 ", since std::complex does not work in Kokkos::View objects.");
1755 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1759 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
// Same guard, now applied to the value type of the crs_vals_d view.
1761 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762 static_assert (! std::is_same<
1763 typename decltype (crs_vals_d)::non_const_value_type,
1764 std::complex<double> >::value,
1765 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766 "never happen, since std::complex does not work in Kokkos::View objects.");
1767 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1771 SourcePids.size(),
true,
"src_pids");
1775 TargetPids.size(),
true,
"tgt_pids");
1777 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of the REDUCE_MAX reduction below;
// bytes_per_value_l is presumably this process's local contribution (the
// lines that assign it and pass it to reduceAll are not visible -- confirm).
1781 size_t bytes_per_value = 0;
1795 size_t bytes_per_value_l = 0;
// Take a sample value: from the source matrix if it has any values, otherwise
// (in a branch whose condition is not visible here) from crs_vals_d.
1796 if (local_matrix.values.extent(0) > 0) {
1797 const ST& val = local_matrix.values(0);
1800 const ST& val = crs_vals_d(0);
1803 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1804 Teuchos::REDUCE_MAX,
1806 outArg(bytes_per_value));
// NOTE(review): this static_assert is identical to the crs_vals_d guard
// that appears earlier in this function.
1809 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1810 static_assert (! std::is_same<
1811 typename decltype (crs_vals_d)::non_const_value_type,
1812 std::complex<double> >::value,
1813 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1814 "never happen, since std::complex does not work in Kokkos::View objects.");
1815 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1817 # ifdef HAVE_TPETRA_MMM_TIMINGS
1818 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
// Arguments of the main unpack call.
// NOTE(review): the callee name and the final argument(s) are missing from
// this listing.
1821 local_matrix, local_col_map, import_lids_d, imports_d,
1822 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1823 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1824 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1826 # ifdef HAVE_TPETRA_MMM_TIMINGS
1831 # ifdef HAVE_TPETRA_MMM_TIMINGS
1832 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Copy each result view back into the caller's arrays.  Every host view is
// constructed from the Teuchos container's raw pointer and size, so the
// deep_copy destination is the caller's own storage.
// NOTE(review): these deep_copy calls take an execution-space instance; any
// fence that follows them is not visible in this listing -- confirm.
1834 typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1835 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1837 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1839 typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1840 CRS_colind.getRawPtr(), CRS_colind.size());
1842 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
// Values are copied through the impl_scalar_type alias of CRS_vals.
1844 typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1845 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1847 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1849 typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1850 TargetPids.getRawPtr(), TargetPids.size());
1852 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
// Instantiation list for (ST, LO, GO, NT), selected further down when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined.  It names
// unpackCrsMatrixAndCombine, unpackAndCombineWithOwningPIDsCount,
// unpackCrsMatrixAndCombineNew, and two overloads of
// unpackAndCombineIntoCrsArrays (Kokkos::View outputs and Teuchos
// ArrayRCP/Array outputs).
// NOTE(review): several continuation lines are missing from this listing --
// the lines introducing each entry (presumably `template <return-type>`) and
// the line closing each Kokkos::View<...> argument list (presumably the extra
// `void, void` template arguments used elsewhere in this file under
// KOKKOS_ENABLE_DEPRECATED_CODE_4) -- confirm against the complete file.
1860 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT ) \
1862 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1863 const CrsMatrix<ST, LO, GO, NT>&, \
1864 const Teuchos::ArrayView<const char>&, \
1865 const Teuchos::ArrayView<const size_t>&, \
1866 const Teuchos::ArrayView<const LO>&, \
1870 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1871 const CrsMatrix<ST, LO, GO, NT> &, \
1872 const Teuchos::ArrayView<const LO> &, \
1873 const Teuchos::ArrayView<const char> &, \
1874 const Teuchos::ArrayView<const size_t>&, \
1878 const Teuchos::ArrayView<const LO>&, \
1879 const Teuchos::ArrayView<const LO>&); \
1881 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1882 const CrsMatrix<ST, LO, GO, NT>&, \
1883 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1884 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1885 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1887 const CombineMode); \
1889 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1890 const CrsMatrix<ST, LO, GO, NT> &, \
1891 const Kokkos::View<LO const *, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1895 const Kokkos::View<const char*, \
1896 Kokkos::Device<typename NT::device_type::execution_space, \
1897 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1899 const Kokkos::View<const size_t*, \
1900 Kokkos::Device<typename NT::device_type::execution_space, \
1901 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1904 const Kokkos::View<LO const *, \
1905 Kokkos::Device<typename NT::device_type::execution_space, \
1906 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1908 const Kokkos::View<LO const *, \
1909 Kokkos::Device<typename NT::device_type::execution_space, \
1910 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1914 Kokkos::View<size_t*,typename NT::device_type>&, \
1915 Kokkos::View<GO*,typename NT::device_type>&, \
1916 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1917 const Teuchos::ArrayView<const int>&, \
1918 Kokkos::View<int*,typename NT::device_type>&); \
1920 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1921 const CrsMatrix<ST, LO, GO, NT> &, \
1922 const Kokkos::View<LO const *, \
1923 Kokkos::Device<typename NT::device_type::execution_space, \
1924 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1926 const Kokkos::View<const char*, \
1927 Kokkos::Device<typename NT::device_type::execution_space, \
1928 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1930 const Kokkos::View<const size_t*, \
1931 Kokkos::Device<typename NT::device_type::execution_space, \
1932 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1935 const Kokkos::View<LO const *, \
1936 Kokkos::Device<typename NT::device_type::execution_space, \
1937 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1939 const Kokkos::View<LO const *, \
1940 Kokkos::Device<typename NT::device_type::execution_space, \
1941 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1945 Teuchos::ArrayRCP<size_t>&, \
1946 Teuchos::ArrayRCP<GO>&, \
1947 Teuchos::ArrayRCP<ST>&, \
1948 const Teuchos::ArrayView<const int>&, \
1949 Teuchos::Array<int>&);
// Instantiation list for (ST, LO, GO, NT), selected further down when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is not defined.  It names the same five
// entries as the _ON variant; here each Kokkos::View<...> parameter type is
// closed directly after its Kokkos::Device<...> argument.
// NOTE(review): the continuation lines introducing each entry (presumably
// `template <return-type>`) and a few parameter lines are missing from this
// listing -- confirm against the complete file.
1951 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT ) \
1953 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1954 const CrsMatrix<ST, LO, GO, NT>&, \
1955 const Teuchos::ArrayView<const char>&, \
1956 const Teuchos::ArrayView<const size_t>&, \
1957 const Teuchos::ArrayView<const LO>&, \
1961 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1962 const CrsMatrix<ST, LO, GO, NT> &, \
1963 const Teuchos::ArrayView<const LO> &, \
1964 const Teuchos::ArrayView<const char> &, \
1965 const Teuchos::ArrayView<const size_t>&, \
1969 const Teuchos::ArrayView<const LO>&, \
1970 const Teuchos::ArrayView<const LO>&); \
1972 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1973 const CrsMatrix<ST, LO, GO, NT>&, \
1974 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1975 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1976 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1978 const CombineMode); \
1980 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1981 const CrsMatrix<ST, LO, GO, NT> &, \
1982 const Kokkos::View<LO const *, \
1983 Kokkos::Device<typename NT::device_type::execution_space, \
1984 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1985 const Kokkos::View<const char*, \
1986 Kokkos::Device<typename NT::device_type::execution_space, \
1987 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1988 const Kokkos::View<const size_t*, \
1989 Kokkos::Device<typename NT::device_type::execution_space, \
1990 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1992 const Kokkos::View<LO const *, \
1993 Kokkos::Device<typename NT::device_type::execution_space, \
1994 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1995 const Kokkos::View<LO const *, \
1996 Kokkos::Device<typename NT::device_type::execution_space, \
1997 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2000 Kokkos::View<size_t*,typename NT::device_type>&, \
2001 Kokkos::View<GO*,typename NT::device_type>&, \
2002 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
2003 const Teuchos::ArrayView<const int>&, \
2004 Kokkos::View<int*,typename NT::device_type>&); \
2006 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
2007 const CrsMatrix<ST, LO, GO, NT> &, \
2008 const Kokkos::View<LO const *, \
2009 Kokkos::Device<typename NT::device_type::execution_space, \
2010 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2011 const Kokkos::View<const char*, \
2012 Kokkos::Device<typename NT::device_type::execution_space, \
2013 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2014 const Kokkos::View<const size_t*, \
2015 Kokkos::Device<typename NT::device_type::execution_space, \
2016 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2018 const Kokkos::View<LO const *, \
2019 Kokkos::Device<typename NT::device_type::execution_space, \
2020 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2021 const Kokkos::View<LO const *, \
2022 Kokkos::Device<typename NT::device_type::execution_space, \
2023 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
2026 Teuchos::ArrayRCP<size_t>&, \
2027 Teuchos::ArrayRCP<GO>&, \
2028 Teuchos::ArrayRCP<ST>&, \
2029 const Teuchos::ArrayView<const int>&, \
2030 Teuchos::Array<int>&);
// Public instantiation macro: forwards to the _ON list when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined and to the _OFF list otherwise.
// NOTE(review): the #else and #endif lines of this conditional are not
// visible in this listing.
2032 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
2033 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2034 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON( ST, LO, GO, NT )
2036 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
2037 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF( ST, LO, GO, NT )
2040 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...