10#ifndef TPETRA_DETAILS_RESIDUAL_HPP
11#define TPETRA_DETAILS_RESIDUAL_HPP
13#include "TpetraCore_config.h"
14#include "Tpetra_CrsMatrix.hpp"
15#include "Tpetra_LocalCrsMatrixOperator.hpp"
16#include "Tpetra_MultiVector.hpp"
17#include "Teuchos_RCP.hpp"
18#include "Teuchos_ScalarTraits.hpp"
21#include "KokkosSparse_spmv_impl.hpp"
39template<
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV,
bool restrictedMode,
bool skipOffRank>
42 using execution_space =
typename AMatrix::execution_space;
43 using LO =
typename AMatrix::non_const_ordinal_type;
44 using value_type =
typename AMatrix::non_const_value_type;
45 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
46 using team_member =
typename team_policy::member_type;
47 using ATV = Kokkos::ArithTraits<value_type>;
78 Kokkos::parallel_for(Kokkos::TeamThreadRange (
dev, 0, rows_per_team),[&] (
const LO&
loop) {
79 const LO
lclRow =
static_cast<LO
> (
dev.league_rank ()) * rows_per_team +
loop;
81 if (
lclRow >= A_lcl.numRows ()) {
87 value_type
A_x = ATV::zero ();
102 const size_t start = A_lcl.graph.row_map(
lclRow);
103 const size_t end = A_lcl.graph.row_map(
lclRow+1);
105 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (
dev, start,
end), [&] (
const LO
iEntry, value_type&
lsum) {
115 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
121 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
125 value_type
A_x = ATV::zero ();
139 const size_t start = A_lcl.graph.row_map(
lclRow);
140 const size_t end = A_lcl.graph.row_map(
lclRow+1);
142 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (
dev, start,
end), [&] (
const LO
iEntry, value_type&
lsum) {
152 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
164template<
class AMatrix,
class MV,
class ConstMV,
class Offsets,
bool is_MV>
167 using execution_space =
typename AMatrix::execution_space;
168 using LO =
typename AMatrix::non_const_ordinal_type;
169 using value_type =
typename AMatrix::non_const_value_type;
170 using team_policy =
typename Kokkos::TeamPolicy<execution_space>;
171 using team_member =
typename team_policy::member_type;
172 using ATV = Kokkos::ArithTraits<value_type>;
197 Kokkos::parallel_for(Kokkos::TeamThreadRange (
dev, 0, rows_per_team),[&] (
const LO&
loop) {
198 const LO
lclRow =
static_cast<LO
> (
dev.league_rank ()) * rows_per_team +
loop;
200 if (
lclRow >= A_lcl.numRows ()) {
205 const size_t end = A_lcl.graph.row_map(
lclRow+1);
209 value_type
A_x = ATV::zero ();
217 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
223 const LO
numVectors =
static_cast<LO
>(X_colmap_lcl.extent(1));
227 value_type
A_x = ATV::zero ();
235 Kokkos::single(Kokkos::PerThread(
dev),[&] () {
245template<
class SC,
class LO,
class GO,
class NO>
250 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
253 using Teuchos::NO_TRANS;
259 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
261 local_matrix_device_type A_lcl =
A.getLocalMatrixDevice ();
270 (
X_colmap.getNumVectors () !=
R.getNumVectors (), std::runtime_error,
271 "X.getNumVectors() = " <<
X_colmap.getNumVectors () <<
" != "
272 "R.getNumVectors() = " <<
R.getNumVectors () <<
".");
275 A.getColMap ()->getLocalNumElements (), std::runtime_error,
276 "X has the wrong number of local rows. "
277 "X.getLocalLength() = " <<
X_colmap.getLocalLength () <<
" != "
278 "A.getColMap()->getLocalNumElements() = " <<
279 A.getColMap ()->getLocalNumElements () <<
".");
281 (
R.getLocalLength () !=
282 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
283 "R has the wrong number of local rows. "
284 "R.getLocalLength() = " <<
R.getLocalLength () <<
" != "
285 "A.getRowMap()->getLocalNumElements() = " <<
286 A.getRowMap ()->getLocalNumElements () <<
".");
288 (
B.getLocalLength () !=
289 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
290 "B has the wrong number of local rows. "
291 "B.getLocalLength() = " <<
B.getLocalLength () <<
" != "
292 "A.getRowMap()->getLocalNumElements() = " <<
293 A.getRowMap ()->getLocalNumElements () <<
".");
296 (!
A.isFillComplete (), std::runtime_error,
"The matrix A is not "
297 "fill complete. You must call fillComplete() (possibly with "
298 "domain and range Map arguments) without an intervening "
299 "resumeFill() call before you may call this method.");
301 (!
X_colmap.isConstantStride () || !
R.isConstantStride () || !
B.isConstantStride (),
302 std::runtime_error,
"X, Y and B must be constant stride.");
306 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () !=
nullptr) ||
307 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () !=
nullptr),
308 std::runtime_error,
"X, Y and R may not alias one another.");
312 if (!fusedResidual) {
313 SC one = Teuchos::ScalarTraits<SC>::one();
315 SC zero = Teuchos::ScalarTraits<SC>::zero();
323 if (A_lcl.numRows() == 0) {
327 int64_t numLocalRows = A_lcl.numRows ();
328 int64_t myNnz = A_lcl.nnz();
331 int vector_length = -1;
332 int64_t rows_per_thread = -1;
335 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
337 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
338 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
340 policy_type policy (1, 1);
342 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
345 policy = policy_type (worksets, team_size, vector_length);
348 bool is_vector = (X_colmap_lcl.extent(1) == 1);
352 if (X_domainmap ==
nullptr) {
354 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
355 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
356 Kokkos::parallel_for(
"residual-vector",policy,func);
361 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
362 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
363 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
364 Kokkos::parallel_for(
"residual-vector",policy,func);
370 if (X_domainmap ==
nullptr) {
372 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
373 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
374 Kokkos::parallel_for(
"residual-multivector",policy,func);
379 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
380 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
381 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
382 Kokkos::parallel_for(
"residual-multivector",policy,func);
389template<
class SC,
class LO,
class GO,
class NO>
390void localResidualWithCommCompOverlap(
const CrsMatrix<SC,LO,GO,NO> & A,
391 MultiVector<SC,LO,GO,NO> & X_colmap,
392 const MultiVector<SC,LO,GO,NO> & B,
393 MultiVector<SC,LO,GO,NO> & R,
394 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
395 const MultiVector<SC,LO,GO,NO> & X_domainmap) {
397 using Teuchos::NO_TRANS;
401 ProfilingRegion regionLocalApply (
"Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
404 using local_view_device_type =
typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
405 using const_local_view_device_type =
typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
406 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
408 local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
409 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
410 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
411 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
412 const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);;
416 TEUCHOS_TEST_FOR_EXCEPTION
417 (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
418 "X.getNumVectors() = " << X_colmap.getNumVectors () <<
" != "
419 "R.getNumVectors() = " << R.getNumVectors () <<
".");
420 TEUCHOS_TEST_FOR_EXCEPTION
421 (X_colmap.getLocalLength () !=
422 A.getColMap ()->getLocalNumElements (), std::runtime_error,
423 "X has the wrong number of local rows. "
424 "X.getLocalLength() = " << X_colmap.getLocalLength () <<
" != "
425 "A.getColMap()->getLocalNumElements() = " <<
426 A.getColMap ()->getLocalNumElements () <<
".");
427 TEUCHOS_TEST_FOR_EXCEPTION
428 (R.getLocalLength () !=
429 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
430 "R has the wrong number of local rows. "
431 "R.getLocalLength() = " << R.getLocalLength () <<
" != "
432 "A.getRowMap()->getLocalNumElements() = " <<
433 A.getRowMap ()->getLocalNumElements () <<
".");
434 TEUCHOS_TEST_FOR_EXCEPTION
435 (B.getLocalLength () !=
436 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
437 "B has the wrong number of local rows. "
438 "B.getLocalLength() = " << B.getLocalLength () <<
" != "
439 "A.getRowMap()->getLocalNumElements() = " <<
440 A.getRowMap ()->getLocalNumElements () <<
".");
442 TEUCHOS_TEST_FOR_EXCEPTION
443 (! A.isFillComplete (), std::runtime_error,
"The matrix A is not "
444 "fill complete. You must call fillComplete() (possibly with "
445 "domain and range Map arguments) without an intervening "
446 "resumeFill() call before you may call this method.");
447 TEUCHOS_TEST_FOR_EXCEPTION
448 (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
449 std::runtime_error,
"X, Y and B must be constant stride.");
452 TEUCHOS_TEST_FOR_EXCEPTION
453 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () !=
nullptr) ||
454 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () !=
nullptr),
455 std::runtime_error,
"X, Y and R may not alias one another.");
458 if (A_lcl.numRows() == 0) {
462 int64_t numLocalRows = A_lcl.numRows ();
463 int64_t myNnz = A_lcl.nnz();
466 int vector_length = -1;
467 int64_t rows_per_thread = -1;
470 using policy_type =
typename Kokkos::TeamPolicy<execution_space>;
472 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
473 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
475 policy_type policy (1, 1);
477 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
480 policy = policy_type (worksets, team_size, vector_length);
483 bool is_vector = (X_colmap_lcl.extent(1) == 1);
487 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
488 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
489 Kokkos::parallel_for(
"residual-vector",policy,func);
491 RCP<const import_type> importer = A.getGraph ()->getImporter ();
492 X_colmap.endImport (X_domainmap, *importer,
INSERT,
true);
494 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-1");
496 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
497 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
498 Kokkos::parallel_for(
"residual-vector-offrank",policy,func2);
503 using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
504 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
505 Kokkos::parallel_for(
"residual-multivector",policy,func);
507 RCP<const import_type> importer = A.getGraph ()->getImporter ();
508 X_colmap.endImport (X_domainmap, *importer,
INSERT,
true);
510 Kokkos::fence(
"Tpetra::localResidualWithCommCompOverlap-2");
512 using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
513 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
514 Kokkos::parallel_for(
"residual-vector-offrank",policy,func2);
521template<
class SC,
class LO,
class GO,
class NO>
529 using Teuchos::rcp_const_cast;
530 using Teuchos::rcpFromRef;
535 if (overlapCommunicationAndComputation)
548 SC one = Teuchos::ScalarTraits<SC>::one(),
negone = -
one,
zero = Teuchos::ScalarTraits<SC>::zero();
559 using offset_type =
typename graph_type::offset_device_view_type;
563 (!
R_in.isDistributed () &&
A.getComm ()->getSize () != 1);
579 if (!
X_in.isConstantStride ()) {
609 (!
importer->getTargetMap()->isLocallyFitted(*
importer->getSourceMap()), std::runtime_error,
610 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
619 A.getCrsGraph()->getLocalOffRankOffsets(offsets);
626 if (!
R_in.isConstantStride ()) {
641 if (!
B_in.isConstantStride ()) {
684 if (overlapCommunicationAndComputation) {
702 if (!
R_in.isConstantStride () ) {
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Struct that holds views of the contents of a CrsMatrix.
typename device_type::execution_space execution_space
The Kokkos execution space.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
static bool fusedResidual()
Fusing SpMV and update in residual instead of using 2 kernel launches. Fusing kernels implies that no...
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
One or more distributed dense vectors.
Implementation details of Tpetra.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.