10#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15#include "TpetraCore_config.h"
16#include "Kokkos_Core.hpp"
17#include "Teuchos_Array.hpp"
18#include "Teuchos_ArrayView.hpp"
19#include "Teuchos_OrdinalTraits.hpp"
20#include "Teuchos_TimeMonitor.hpp"
28#include "Tpetra_Details_DefaultTypes.hpp"
59namespace UnpackAndCombineCrsMatrixImpl {
70template<
class ST,
class LO,
class GO>
79 const size_t bytes_per_value)
85 bool unpack_pids =
pids_out.size() > 0;
95 const size_t pids_len = unpack_pids ?
104 const char*
const pids_in = unpack_pids ? imports +
pids_beg :
nullptr;
115 Kokkos::pair<int, size_t>
p;
154template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
159 typedef typename local_matrix_type::value_type ST;
163 typedef typename DT::execution_space XS;
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
187 size_t bytes_per_value;
189 error_type error_code;
218 void operator()(member_type team_member)
const
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
224 const LO
batch = team_member.league_rank();
236 const size_t buf_size = imports.size();
256#ifndef KOKKOS_ENABLE_SYCL
258 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
263 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
270#ifndef KOKKOS_ENABLE_SYCL
272 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273 "At row %d, the offset (%d) > buffer size (%d)\n",
277 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
310#ifndef KOKKOS_ENABLE_SYCL
312 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
317 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
323 Kokkos::parallel_for(
343 if (combine_mode ==
ADD) {
348 (
void)local_matrix.sumIntoValues(
356 }
else if (combine_mode ==
REPLACE) {
361 (
void)local_matrix.replaceValues(
372#ifndef KOKKOS_ENABLE_SYCL
374 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375 "At row %d, an unknown error occurred during unpack\n", (
int)
lid_no
378 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
383 team_member.team_barrier();
389 auto error_code_h = Kokkos::create_mirror_view_and_copy(
390 Kokkos::HostSpace(), error_code
397struct MaxNumEntTag {};
398struct TotNumEntTag {};
408template<
class LO,
class DT,
class BDT>
411 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412 typedef Kokkos::View<const size_t*, DT> offsets_type;
413 typedef Kokkos::View<const char*, BDT> input_buffer_type;
419 num_packets_per_lid_type num_packets_per_lid;
420 offsets_type offsets;
421 input_buffer_type imports;
435 const size_t num_bytes = num_packets_per_lid(
i);
438 const char*
const in_buf = imports.data () + offsets(
i);
447 join (
const MaxNumEntTag,
451 if (dst < src) dst = src;
457 const size_t num_bytes = num_packets_per_lid(
i);
460 const char*
const in_buf = imports.data () + offsets(
i);
474template<
class LO,
class DT,
class BDT>
477 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478 const Kokkos::View<const size_t*, DT>& offsets,
479 const Kokkos::View<const char*, BDT>& imports)
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
488 static_cast<LO
> (num_packets_per_lid.extent (0));
489 size_t max_num_ent = 0;
490 Kokkos::parallel_reduce (
"Max num entries in CRS",
503template<
class LO,
class DT,
class BDT>
506 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507 const Kokkos::View<const size_t*, DT>& offsets,
508 const Kokkos::View<const char*, BDT>& imports)
510 typedef typename DT::execution_space XS;
511 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag>
range_policy;
516 static_cast<LO
> (num_packets_per_lid.extent (0));
517 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
526unpackRowCount(
const char imports[],
541 return static_cast<size_t>(num_ent_LO);
548template<
class View1,
class View2>
556 using LO =
typename View2::value_type;
562 batch_info(
batch, 0) =
static_cast<LO
>(
i);
567 return batch == batch_info.extent(0);
577template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
582 const Kokkos::View<const char*, BufferDeviceType>& imports,
583 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
587 using ST =
typename LocalMatrix::value_type;
590 using XS =
typename DT::execution_space;
592 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593 "unpackAndCombineIntoCrsMatrix: ";
595 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
604 std::invalid_argument,
605 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
606 "static graph (i.e., was constructed with the CrsMatrix constructor "
607 "that takes a const CrsGraph pointer).");
610 std::invalid_argument,
611 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
612 "(i.e., was constructed with the CrsMatrix constructor that takes a "
613 "const CrsGraph pointer).");
617 std::invalid_argument,
618 prefix <<
"Invalid combine mode; should never get "
619 "here! Please report this bug to the Tpetra developers.");
625 std::invalid_argument,
627 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
641 Kokkos::View<LO*[2], DT> batch_info(
"",
num_batches);
644 Kokkos::parallel_reduce(
645 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0,
num_import_lids),
649 imports.data(), offsets(
i), num_packets_per_lid(
i)
677 const bool atomic = XS().concurrency() != 1;
693 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695 if (!Spaces::is_gpu_exec_space<XS>() ||
team_size == Teuchos::OrdinalTraits<size_t>::invalid())
704 auto error_code =
f.error();
708 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
712template<
class LocalMatrix,
class BufferDeviceType>
717 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
718 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
721 using Kokkos::parallel_reduce;
722 typedef typename LocalMatrix::ordinal_type LO;
723 typedef typename LocalMatrix::device_type device_type;
724 typedef typename device_type::execution_space XS;
725 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
726 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> >
range_policy;
738 update +=
static_cast<size_t>(local_matrix.graph.row_map[
lid+1]
739 -local_matrix.graph.row_map[
lid]);
745 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
748 parallel_reduce(range_policy(0, num_items),
749 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
750 const LO lid = permute_from_lids(i);
751 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
752 - local_matrix.graph.row_map[lid]);
759 const size_type np = num_packets_per_lid.extent(0);
760 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
763 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
771template<
class LO,
class DT,
class BDT>
773setupRowPointersForRemotes(
776 const Kokkos::View<const char*, BDT>& imports,
777 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
780 using Kokkos::parallel_reduce;
781 typedef typename DT::execution_space XS;
783 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
786 const size_type
N = num_packets_per_lid.extent(0);
793 const size_t num_bytes = num_packets_per_lid(
i);
794 const size_t offset = offsets(
i);
807makeCrsRowPtrFromLengths(
811 using Kokkos::parallel_scan;
812 typedef typename DT::execution_space XS;
813 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
814 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
817 KOKKOS_LAMBDA(
const size_t&
i,
size_t& update,
const bool&
final) {
828template<
class LocalMatrix,
class LocalMap>
832 const typename PackTraits<int>::output_array_type& tgt_pids,
834 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
836 const typename PackTraits<int>::input_array_type& src_pids,
837 const LocalMatrix& local_matrix,
838 const LocalMap& local_col_map,
839 const size_t num_same_ids,
842 using Kokkos::parallel_for;
845 typedef typename DT::execution_space XS;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
848 parallel_for(range_policy(0, num_same_ids),
849 KOKKOS_LAMBDA(
const size_t i) {
850 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
852 const LO src_lid =
static_cast<LO
>(i);
853 size_t src_row = local_matrix.graph.row_map(src_lid);
855 const LO tgt_lid =
static_cast<LO
>(i);
856 const size_t tgt_row = tgt_rowptr(tgt_lid);
858 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
859 - local_matrix.graph.row_map(src_lid);
860 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
862 for (
size_t j=local_matrix.graph.row_map(src_lid);
863 j<local_matrix.graph.row_map(src_lid+1); ++j) {
864 LO src_col = local_matrix.graph.entries(j);
865 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
866 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
867 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
873template<
class LocalMatrix,
class LocalMap>
875copyDataFromPermuteIDs(
877 const typename PackTraits<int>::output_array_type& tgt_pids,
879 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
881 const typename PackTraits<int>::input_array_type& src_pids,
884 const LocalMatrix& local_matrix,
885 const LocalMap& local_col_map,
888 using Kokkos::parallel_for;
891 typedef typename DT::execution_space XS;
892 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
893 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
895 const size_type num_permute_to_lids = permute_to_lids.extent(0);
897 parallel_for(range_policy(0, num_permute_to_lids),
898 KOKKOS_LAMBDA(
const size_t i) {
899 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
901 const LO src_lid = permute_from_lids(i);
902 const size_t src_row = local_matrix.graph.row_map(src_lid);
904 const LO tgt_lid = permute_to_lids(i);
905 const size_t tgt_row = tgt_rowptr(tgt_lid);
907 size_t nsr = local_matrix.graph.row_map(src_lid+1)
908 - local_matrix.graph.row_map(src_lid);
909 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
911 for (
size_t j=local_matrix.graph.row_map(src_lid);
912 j<local_matrix.graph.row_map(src_lid+1); ++j) {
913 LO src_col = local_matrix.graph.entries(j);
914 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
915 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
916 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
922template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
924unpackAndCombineIntoCrsArrays2(
926 const typename PackTraits<int>::output_array_type& tgt_pids,
928 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
931 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
932 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
936 const size_t bytes_per_value)
939 using Kokkos::subview;
940 using Kokkos::MemoryUnmanaged;
941 using Kokkos::parallel_reduce;
942 using Kokkos::atomic_fetch_add;
947 typedef typename LocalMatrix::value_type ST;
948 typedef typename DT::execution_space XS;
949 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
950 typedef typename Kokkos::pair<size_type, size_type> slice;
951 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
953 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
954 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
955 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
957 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
960 const size_type num_import_lids = import_lids.size();
963 parallel_reduce (
"Unpack and combine into CRS",
964 range_policy (0, num_import_lids),
965 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
966 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
967 const size_t num_bytes = num_packets_per_lid(i);
968 const size_t offset = offsets(i);
969 if (num_bytes == 0) {
973 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
974 if (num_ent == InvalidNum) {
978 const LO lcl_row = import_lids(i);
979 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
980 const size_t end_row = start_row + num_ent;
982 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
983 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
984 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
986 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
987 imports.data(), offset, num_bytes,
988 num_ent, bytes_per_value);
991 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
992 const int pid = pids_out(j);
993 pids_out(j) = (pid != my_pid) ? pid : -1;
1000template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1003 const LocalMatrix & local_matrix,
1004 const LocalMap & local_col_map,
1006 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1007 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1013 const typename PackTraits<int>::input_array_type& src_pids,
1014 const typename PackTraits<int>::output_array_type& tgt_pids,
1015 const size_t num_same_ids,
1016 const size_t tgt_num_rows,
1017 const size_t tgt_num_nonzeros,
1018 const int my_tgt_pid,
1019 const size_t bytes_per_value)
1022 using Kokkos::subview;
1023 using Kokkos::parallel_for;
1024 using Kokkos::MemoryUnmanaged;
1028 typedef typename DT::execution_space XS;
1029 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1030 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1031 typedef BufferDeviceType BDT;
1033 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1035 const size_t N = tgt_num_rows;
1039 const int my_pid = my_tgt_pid;
1042 parallel_for(range_policy(0, N+1),
1043 KOKKOS_LAMBDA(
const size_t i) {
1049 parallel_for(range_policy(0, num_same_ids),
1050 KOKKOS_LAMBDA(
const size_t i) {
1051 const LO tgt_lid =
static_cast<LO
>(i);
1052 const LO src_lid =
static_cast<LO
>(i);
1053 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1054 - local_matrix.graph.row_map(src_lid);
1059 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1060 parallel_for(range_policy(0, num_permute_to_lids),
1061 KOKKOS_LAMBDA(
const size_t i) {
1062 const LO tgt_lid = permute_to_lids(i);
1063 const LO src_lid = permute_from_lids(i);
1064 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1065 - local_matrix.graph.row_map(src_lid);
1070 const size_type num_import_lids = import_lids.extent(0);
1071 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1074#ifdef HAVE_TPETRA_DEBUG
1076 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1077 const bool condition =
1078 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1079 TEUCHOS_TEST_FOR_EXCEPTION
1080 (condition, std::logic_error, prefix
1081 <<
"The final offset in bytes " << nth_offset_h
1082 <<
" != imports.size() = " << imports.extent(0)
1083 <<
". Please report this bug to the Tpetra developers.");
1089 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1090 import_lids, imports, num_packets_per_lid, offsets);
1091 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1092 <<
" Error transferring data to target row pointers. "
1093 "Please report this bug to the Tpetra developers.");
1097 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1100 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1103 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1104 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1106 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1107 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1108 local_matrix, local_col_map, my_pid);
1110 if (imports.extent(0) <= 0) {
1114 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1115 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1116 local_matrix, local_col_map, my_pid, bytes_per_value);
1117 TEUCHOS_TEST_FOR_EXCEPTION(
1118 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1119 "should never happen. Please report this bug to the Tpetra developers.");
1165template<
typename ST,
typename LO,
typename GO,
typename Node>
1169 const Teuchos::ArrayView<const char>& imports,
1171 const Teuchos::ArrayView<const LO>&
importLIDs,
1176 typedef typename Node::device_type device_type;
1178 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1179 "Node::device_type and LocalMatrix::device_type must be the same.");
1197 imports.size(),
true,
"imports");
1199 auto local_matrix =
sourceMatrix.getLocalMatrixDevice();
1200 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1211 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1217template<
typename ST,
typename LO,
typename GO,
typename NT>
1219unpackCrsMatrixAndCombineNew(
1221 Kokkos::DualView<
char*,
1223 Kokkos::DualView<
size_t*,
1225 const Kokkos::DualView<
const LO*,
1233 using device_type =
typename crs_matrix_type::device_type;
1234 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1235 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1238 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1239 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1240 "must be the same.");
1245 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1247 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1248 auto import_lids_d = importLIDs.view_device ();
1250 if (imports.need_sync_device()) {
1251 imports.sync_device ();
1253 auto imports_d = imports.view_device ();
1255 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1256 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1257 typedef decltype (local_col_map) local_map_type;
1259 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1260 local_matrix_device_type,
1263 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1264 import_lids_d, combineMode);
1322template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1326 const Teuchos::ArrayView<const LocalOrdinal> &
importLIDs,
1327 const Teuchos::ArrayView<const char> &imports,
1332 const Teuchos::ArrayView<const LocalOrdinal>&
permuteToLIDs,
1335 using Kokkos::MemoryUnmanaged;
1337 typedef typename Node::device_type DT;
1338 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1349 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1355 auto local_matrix =
sourceMatrix.getLocalMatrixDevice ();
1357 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1358 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1364 "permute_from_lids");
1366 Kokkos::View<const char*, kokkos_device_type, void, void >
imports_d =
1368 imports.getRawPtr (),
1369 imports.size (),
true,
1376 "num_packets_per_lid");
1378 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1398template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1403 Kokkos::Device<
typename Node::device_type::execution_space,
1404 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1406 const Kokkos::View<
const char*,
1407 Kokkos::Device<
typename Node::device_type::execution_space,
1408 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1410 const Kokkos::View<
const size_t*,
1411 Kokkos::Device<
typename Node::device_type::execution_space,
1412 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1414 const size_t numSameIDs,
1416 Kokkos::Device<
typename Node::device_type::execution_space,
1417 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1420 Kokkos::Device<
typename Node::device_type::execution_space,
1421 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1425 Kokkos::View<size_t*,typename Node::device_type> &
crs_rowptr_d,
1426 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &
crs_colind_d,
1428 const Teuchos::ArrayView<const int>&
SourcePids,
1429 Kokkos::View<int*,typename Node::device_type> &
TargetPids)
1431 using execution_space =
typename Node::execution_space;
1435 using Kokkos::deep_copy;
1437 using Teuchos::ArrayView;
1438 using Teuchos::outArg;
1439 using Teuchos::REDUCE_MAX;
1440 using Teuchos::reduceAll;
1442 typedef typename Node::device_type DT;
1445 typedef typename matrix_type::impl_scalar_type ST;
1447 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1448# ifdef HAVE_TPETRA_MMM_TIMINGS
1449 using Teuchos::TimeMonitor;
1450 Teuchos::RCP<TimeMonitor> tm;
1453 using Kokkos::MemoryUnmanaged;
1464 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1470 auto local_matrix =
sourceMatrix.getLocalMatrixDevice ();
1473# ifdef HAVE_TPETRA_MMM_TIMINGS
1474 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1477 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1480# ifdef HAVE_TPETRA_MMM_TIMINGS
1484# ifdef HAVE_TPETRA_MMM_TIMINGS
1485 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"resize CRS pointers"))));
1490# ifdef HAVE_TPETRA_MMM_TIMINGS
1502 Kokkos::deep_copy(execution_space(),
TargetPids, -1);
1505 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1507# ifdef HAVE_TPETRA_MMM_TIMINGS
1508 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"create mirror views from inputs"))));
1517# ifdef HAVE_TPETRA_MMM_TIMINGS
1521 size_t bytes_per_value = 0;
1536 if (local_matrix.values.extent(0) > 0) {
1537 const ST&
val = local_matrix.values(0);
1543 Teuchos::reduceAll<int, size_t>(*(
sourceMatrix.getComm()),
1544 Teuchos::REDUCE_MAX,
1546 outArg(bytes_per_value));
1549# ifdef HAVE_TPETRA_MMM_TIMINGS
1550 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1552 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1558# ifdef HAVE_TPETRA_MMM_TIMINGS
1563# ifdef HAVE_TPETRA_MMM_TIMINGS
1564 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"copy back to host"))));
1567 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,
TargetPids.size()),
KOKKOS_LAMBDA (
const size_t i) {
1573template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1578 Kokkos::Device<
typename Node::device_type::execution_space,
1579 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1581 const Kokkos::View<
const char*,
1582 Kokkos::Device<
typename Node::device_type::execution_space,
1583 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1585 const Kokkos::View<
const size_t*,
1586 Kokkos::Device<
typename Node::device_type::execution_space,
1587 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1589 const size_t numSameIDs,
1591 Kokkos::Device<
typename Node::device_type::execution_space,
1592 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1595 Kokkos::Device<
typename Node::device_type::execution_space,
1596 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1601 Teuchos::ArrayRCP<GlobalOrdinal>&
CRS_colind,
1602 Teuchos::ArrayRCP<Scalar>&
CRS_vals,
1603 const Teuchos::ArrayView<const int>&
SourcePids,
1606 using execution_space =
typename Node::execution_space;
1610 using Kokkos::deep_copy;
1612 using Teuchos::ArrayView;
1613 using Teuchos::outArg;
1614 using Teuchos::REDUCE_MAX;
1615 using Teuchos::reduceAll;
1617 typedef typename Node::device_type DT;
1620 typedef typename matrix_type::impl_scalar_type ST;
1622 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1623# ifdef HAVE_TPETRA_MMM_TIMINGS
1624 using Teuchos::TimeMonitor;
1625 Teuchos::RCP<TimeMonitor> tm;
1628 using Kokkos::MemoryUnmanaged;
1639 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1645 auto local_matrix =
sourceMatrix.getLocalMatrixDevice ();
1648# ifdef HAVE_TPETRA_MMM_TIMINGS
1649 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1652 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1655# ifdef HAVE_TPETRA_MMM_TIMINGS
1659# ifdef HAVE_TPETRA_MMM_TIMINGS
1660 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"resize CRS pointers"))));
1666# ifdef HAVE_TPETRA_MMM_TIMINGS
1682 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1684# ifdef HAVE_TPETRA_MMM_TIMINGS
1685 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"create mirror views from inputs"))));
1697#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1698 static_assert (! std::is_same<
1699 typename std::remove_const<
1700 typename std::decay<
1704 std::complex<double> >::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1713#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert (! std::is_same<
1715 typename decltype (
crs_vals_d)::non_const_value_type,
1716 std::complex<double> >::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1729# ifdef HAVE_TPETRA_MMM_TIMINGS
1733 size_t bytes_per_value = 0;
1748 if (local_matrix.values.extent(0) > 0) {
1749 const ST&
val = local_matrix.values(0);
1755 Teuchos::reduceAll<int, size_t>(*(
sourceMatrix.getComm()),
1756 Teuchos::REDUCE_MAX,
1758 outArg(bytes_per_value));
1761#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762 static_assert (! std::is_same<
1763 typename decltype (
crs_vals_d)::non_const_value_type,
1764 std::complex<double> >::value,
1765 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766 "never happen, since std::complex does not work in Kokkos::View objects.");
1769# ifdef HAVE_TPETRA_MMM_TIMINGS
1770 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1772 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1778# ifdef HAVE_TPETRA_MMM_TIMINGS
1783# ifdef HAVE_TPETRA_MMM_TIMINGS
1784 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
prefix + std::string(
"copy back to host"))));
1812#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1814 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1815 const CrsMatrix<ST, LO, GO, NT>&, \
1816 const Teuchos::ArrayView<const char>&, \
1817 const Teuchos::ArrayView<const size_t>&, \
1818 const Teuchos::ArrayView<const LO>&, \
1822 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1823 const CrsMatrix<ST, LO, GO, NT> &, \
1824 const Teuchos::ArrayView<const LO> &, \
1825 const Teuchos::ArrayView<const char> &, \
1826 const Teuchos::ArrayView<const size_t>&, \
1830 const Teuchos::ArrayView<const LO>&, \
1831 const Teuchos::ArrayView<const LO>&); \
1833 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1834 const CrsMatrix<ST, LO, GO, NT>&, \
1835 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1836 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1837 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1839 const CombineMode); \
1841 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1842 const CrsMatrix<ST, LO, GO, NT> &, \
1843 const Kokkos::View<LO const *, \
1844 Kokkos::Device<typename NT::device_type::execution_space, \
1845 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1847 const Kokkos::View<const char*, \
1848 Kokkos::Device<typename NT::device_type::execution_space, \
1849 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1851 const Kokkos::View<const size_t*, \
1852 Kokkos::Device<typename NT::device_type::execution_space, \
1853 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1856 const Kokkos::View<LO const *, \
1857 Kokkos::Device<typename NT::device_type::execution_space, \
1858 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1860 const Kokkos::View<LO const *, \
1861 Kokkos::Device<typename NT::device_type::execution_space, \
1862 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1866 Kokkos::View<size_t*,typename NT::device_type>&, \
1867 Kokkos::View<GO*,typename NT::device_type>&, \
1868 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1869 const Teuchos::ArrayView<const int>&, \
1870 Kokkos::View<int*,typename NT::device_type>&); \
1872 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1873 const CrsMatrix<ST, LO, GO, NT> &, \
1874 const Kokkos::View<LO const *, \
1875 Kokkos::Device<typename NT::device_type::execution_space, \
1876 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1878 const Kokkos::View<const char*, \
1879 Kokkos::Device<typename NT::device_type::execution_space, \
1880 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1882 const Kokkos::View<const size_t*, \
1883 Kokkos::Device<typename NT::device_type::execution_space, \
1884 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1887 const Kokkos::View<LO const *, \
1888 Kokkos::Device<typename NT::device_type::execution_space, \
1889 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1891 const Kokkos::View<LO const *, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1897 Teuchos::ArrayRCP<size_t>&, \
1898 Teuchos::ArrayRCP<GO>&, \
1899 Teuchos::ArrayRCP<ST>&, \
1900 const Teuchos::ArrayView<const int>&, \
1901 Teuchos::Array<int>&);
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Unpacks and combines a single row of the CrsMatrix.
int error() const
Host function for getting the error.