Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11#define TPETRA_MATRIXMATRIX_DEF_HPP
13#include "KokkosSparse_Utils.hpp"
14#include "Tpetra_ConfigDefs.hpp"
16#include "Teuchos_VerboseObject.hpp"
17#include "Teuchos_Array.hpp"
18#include "Tpetra_Util.hpp"
19#include "Tpetra_CrsMatrix.hpp"
20#include "Tpetra_BlockCrsMatrix.hpp"
22#include "Tpetra_RowMatrixTransposer.hpp"
25#include "Tpetra_Details_makeColMap.hpp"
26#include "Tpetra_ConfigDefs.hpp"
27#include "Tpetra_Map.hpp"
28#include "Tpetra_Export.hpp"
31#include <algorithm>
32#include <type_traits>
33#include "Teuchos_FancyOStream.hpp"
34
35#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
37
38#include "KokkosSparse_spgemm.hpp"
39#include "KokkosSparse_spadd.hpp"
40#include "Kokkos_Bitset.hpp"
41
43
51/*********************************************************************************************************/
52// Include the architecture-specific kernel partial specializations here
53// NOTE: This needs to be outside all namespaces
54#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55#include "TpetraExt_MatrixMatrix_Cuda.hpp"
56#include "TpetraExt_MatrixMatrix_HIP.hpp"
57#include "TpetraExt_MatrixMatrix_SYCL.hpp"
58
59namespace Tpetra {
60
61namespace MatrixMatrix{
62
63//
64// This method forms the matrix-matrix product C = op(A) * op(B), where
65// op(A) == A if transposeA is false,
66// op(A) == A^T if transposeA is true,
67// and similarly for op(B).
68//
69template <class Scalar,
70 class LocalOrdinal,
71 class GlobalOrdinal,
72 class Node>
75 bool transposeA,
77 bool transposeB,
80 const std::string& label,
81 const Teuchos::RCP<Teuchos::ParameterList>& params)
82{
83 using Teuchos::null;
84 using Teuchos::RCP;
85 using Teuchos::rcp;
86 typedef Scalar SC;
87 typedef LocalOrdinal LO;
88 typedef GlobalOrdinal GO;
89 typedef Node NO;
90 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
91 typedef Import<LO,GO,NO> import_type;
93 typedef Map<LO,GO,NO> map_type;
95
96#ifdef HAVE_TPETRA_MMM_TIMINGS
97 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
98 using Teuchos::TimeMonitor;
99 //MM is used to time setup, and then multiply.
100
101 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
102#endif
103
104 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
105
106 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
107
108 // The input matrices A and B must both be fillComplete.
109 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
110 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
111
112 // If transposeA is true, then Aprime will be the transpose of A
113 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
114 // will just be a pointer to A.
116 // If transposeB is true, then Bprime will be the transpose of B
117 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
118 // will just be a pointer to B.
120
121 // Is this a "clean" matrix?
122 //
123 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
124 // locally nor globally indexed, then it was empty. I don't like
125 // this, because the most straightforward implementation presumes
126 // lazy allocation of indices. However, historical precedent
127 // demands that we keep around this predicate as a way to test
128 // whether the matrix is empty.
129 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
130
131 bool use_optimized_ATB = false;
133 use_optimized_ATB = true;
134
135#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
136 use_optimized_ATB = false;
137#endif
138
139 using Teuchos::ParameterList;
141 transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
142
145 Aprime = transposer.createTranspose (transposeParams);
146 }
147 else {
149 }
150
151 if (transposeB) {
153 Bprime = transposer.createTranspose (transposeParams);
154 }
155 else {
157 }
158
159 // Check size compatibility
160 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
161 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
162 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
163 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
164 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
165 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
166 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
167 prefix << "ERROR, inner dimensions of op(A) and op(B) "
168 "must match for matrix-matrix product. op(A) is "
169 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
170
171 // The result matrix C must at least have a row-map that reflects the correct
172 // row-size. Don't check the number of columns because rectangular matrices
173 // which were constructed with only one map can still end up having the
174 // correct capacity and dimensions when filled.
175 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
176 prefix << "ERROR, dimensions of result C must "
177 "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
178 << " rows, should have at least " << Aouter << std::endl);
179
180 // It doesn't matter whether C is already Filled or not. If it is already
181 // Filled, it must have space allocated for the positions that will be
182 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
183 // we'll error out later when trying to store result values.
184
185 // CGB: However, matrix must be in active-fill
186 if (!C.isFillActive()) C.resumeFill();
187
188 // We're going to need to import remotely-owned sections of A and/or B if
189 // more than one processor is performing this run, depending on the scenario.
190 int numProcs = A.getComm()->getSize();
191
192 // Declare a couple of structs that will be used to hold views of the data
193 // of A and B, to be used for fast access during the matrix-multiplication.
196
199
200#ifdef HAVE_TPETRA_MMM_TIMINGS
201 {
202 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
203#endif
204
205 // Now import any needed remote rows and populate the Aview struct
206 // NOTE: We assert that an import isn't needed --- since we do the transpose
207 // above to handle that.
208 if (!use_optimized_ATB) {
210 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
211 }
212
213 // We will also need local access to all rows of B that correspond to the
214 // column-map of op(A).
215 if (numProcs > 1)
216 targetMap_B = Aprime->getColMap();
217
218 // Import any needed remote rows and populate the Bview struct.
220 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
221
222#ifdef HAVE_TPETRA_MMM_TIMINGS
223 } //stop MM_importExtract here
224 //stop the setup timer, and start the multiply timer
225 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
226#endif
227
228 // Call the appropriate method to perform the actual multiplication.
229 if (use_optimized_ATB) {
230 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
231
232 } else if (call_FillComplete_on_result && newFlag) {
233 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
234
235 } else if (call_FillComplete_on_result) {
236 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
237
238 } else {
239 // mfh 27 Sep 2016: Is this the "slow" case? This
240 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
241 // thread-parallel inserts, but that may take some effort.
243
244 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
245 }
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
249#endif
250 if (call_FillComplete_on_result && !C.isFillComplete()) {
251 // We'll call FillComplete on the C matrix before we exit, and give it a
252 // domain-map and a range-map.
253 // The domain-map will be the domain-map of B, unless
254 // op(B)==transpose(B), in which case the range-map of B will be used.
255 // The range-map will be the range-map of A, unless op(A)==transpose(A),
256 // in which case the domain-map of A will be used.
257 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
258 }
259}
260
261//
262// This method forms the matrix-matrix product C = op(A) * op(B), where
263// op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
264//
265template <class Scalar,
266 class LocalOrdinal,
267 class GlobalOrdinal,
268 class Node>
271 bool transposeA,
273 bool transposeB,
275 const std::string& label)
276{
277 using Teuchos::null;
278 using Teuchos::RCP;
279 using Teuchos::rcp;
280 typedef Scalar SC;
281 typedef LocalOrdinal LO;
282 typedef GlobalOrdinal GO;
283 typedef Node NO;
285 typedef Map<LO,GO,NO> map_type;
286 typedef Import<LO,GO,NO> import_type;
287
288 std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
289
290 TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
291 TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
292
293 // Check size compatibility
294 global_size_t numACols = A->getGlobalNumCols();
295 global_size_t numBCols = B->getGlobalNumCols();
296 global_size_t numARows = A->getGlobalNumRows();
297 global_size_t numBRows = B->getGlobalNumRows();
298
303 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
304 prefix << "ERROR, inner dimensions of op(A) and op(B) "
305 "must match for matrix-matrix product. op(A) is "
306 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
307
308 // We're going to need to import remotely-owned sections of A and/or B if
309 // more than one processor is performing this run, depending on the scenario.
310 int numProcs = A->getComm()->getSize();
311
312 const LO blocksize = A->getBlockSize();
313 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
314 prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
315 blocksize << ", B.blocksize = " << B->getBlockSize() );
316
317 // Declare a couple of structs that will be used to hold views of the data
318 // of A and B, to be used for fast access during the matrix-multiplication.
321
322 RCP<const map_type> targetMap_A = A->getRowMap();
323 RCP<const map_type> targetMap_B = B->getRowMap();
324
325 // Populate the Aview struct. No remotes are needed.
327 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
328
329 // We will also need local access to all rows of B that correspond to the
330 // column-map of op(A).
331 if (numProcs > 1)
332 targetMap_B = A->getColMap();
333
334 // Import any needed remote rows and populate the Bview struct.
335 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
336 A->getGraph()->getImporter().is_null());
337
338 // Call the appropriate method to perform the actual multiplication.
339 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
340}
341
342template <class Scalar,
343 class LocalOrdinal,
344 class GlobalOrdinal,
345 class Node>
352 const std::string& label,
353 const Teuchos::RCP<Teuchos::ParameterList>& params)
354{
355 using Teuchos::RCP;
356 typedef Scalar SC;
357 typedef LocalOrdinal LO;
358 typedef GlobalOrdinal GO;
359 typedef Node NO;
360 typedef Import<LO,GO,NO> import_type;
362 typedef Map<LO,GO,NO> map_type;
363 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
364
365#ifdef HAVE_TPETRA_MMM_TIMINGS
366 std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
367 using Teuchos::TimeMonitor;
368 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
369#endif
370
371 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
372
373 // A and B should already be Filled.
374 // Should we go ahead and call FillComplete() on them if necessary or error
375 // out? For now, we choose to error out.
376 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
377 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
378
381
382 // Now check size compatibility
383 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
384 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
385 global_size_t Aouter = A.getGlobalNumRows();
388 global_size_t Binner = B.getGlobalNumRows();
389 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
390 prefix << "ERROR, inner dimensions of op(A) and op(B) "
391 "must match for matrix-matrix product. op(A) is "
392 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
393
394 // The result matrix C must at least have a row-map that reflects the correct
395 // row-size. Don't check the number of columns because rectangular matrices
396 // which were constructed with only one map can still end up having the
397 // correct capacity and dimensions when filled.
398 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
399 prefix << "ERROR, dimensions of result C must "
400 "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
401 << " rows, should have at least "<< Aouter << std::endl);
402
403 // It doesn't matter whether C is already Filled or not. If it is already
404 // Filled, it must have space allocated for the positions that will be
405 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
406 // we'll error out later when trying to store result values.
407
408 // CGB: However, matrix must be in active-fill
409 TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
410
411 // We're going to need to import remotely-owned sections of A and/or B if
412 // more than one processor is performing this run, depending on the scenario.
413 int numProcs = A.getComm()->getSize();
414
415 // Declare a couple of structs that will be used to hold views of the data of
416 // A and B, to be used for fast access during the matrix-multiplication.
419
422
423#ifdef HAVE_TPETRA_MMM_TIMINGS
424 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
425 {
426#endif
427
428 // Enable globalConstants by default
429 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
430 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
431 if(!params.is_null()) {
432 importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
434 auto slist = params->sublist("matrixmatrix: kernel params",false);
436 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
438 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
439 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
440 auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
441 ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
442 ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
443 ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
444 }
445
446 //Now import any needed remote rows and populate the Aview struct.
448 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
449
450 // We will also need local access to all rows of B that correspond to the
451 // column-map of op(A).
452 if (numProcs > 1)
453 targetMap_B = Aprime->getColMap();
454
455 // Now import any needed remote rows and populate the Bview struct.
456 // Enable globalConstants by default
457 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
458 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
459 if(!params.is_null()) {
460 importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
461
462 auto slist = params->sublist("matrixmatrix: kernel params",false);
464 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
465 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
466 auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
467 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
469 ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
470 ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
471 ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
472 }
473
474 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
475
476#ifdef HAVE_TPETRA_MMM_TIMINGS
477 }
478 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
479#endif
480
481 // Now call the appropriate method to perform the actual multiplication.
483
484 // Is this a "clean" matrix
485 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
486
488 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
489
490 } else if (call_FillComplete_on_result) {
491 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
492
493 } else {
494 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
495 }
496
497 if(!params.is_null()) {
498 bool removeZeroEntries = params->get("remove zeros", false);
499 if (removeZeroEntries) {
500 typedef Teuchos::ScalarTraits<Scalar> STS;
501 typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
503 }
504 }
505}
506
507
508template <class Scalar,
509 class LocalOrdinal,
510 class GlobalOrdinal,
511 class Node>
512void Add(
514 bool transposeA,
518{
519 using Teuchos::Array;
520 using Teuchos::RCP;
521 using Teuchos::null;
522 typedef Scalar SC;
523 typedef LocalOrdinal LO;
524 typedef GlobalOrdinal GO;
525 typedef Node NO;
526 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
528
529 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
530
531 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
532 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
533 "(Result matrix B is not required to be isFillComplete()).");
534 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
535 prefix << "ERROR, input matrix B must not be fill complete!");
536 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
537 prefix << "ERROR, input matrix B must not have static graph!");
538 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
539 prefix << "ERROR, input matrix B must not be locally indexed!");
540
541 using Teuchos::ParameterList;
543 transposeParams->set ("sort", false);
544
546 if (transposeA) {
548 Aprime = transposer.createTranspose (transposeParams);
549 }
550 else {
552 }
553
554 size_t a_numEntries;
555 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
556 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
557 GO row;
558
559 if (scalarB != Teuchos::ScalarTraits<SC>::one())
560 B.scale(scalarB);
561
562 size_t numMyRows = B.getLocalNumRows();
563 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
564 for (LO i = 0; (size_t)i < numMyRows; ++i) {
565 row = B.getRowMap()->getGlobalElement(i);
566 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
567
568 if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
569 for (size_t j = 0; j < a_numEntries; ++j)
570 a_vals[j] *= scalarA;
571 }
572 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
573 }
574 }
575}
576
577template <class Scalar,
578 class LocalOrdinal,
579 class GlobalOrdinal,
580 class Node>
581Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
583 const bool transposeA,
585 const Scalar& beta,
586 const bool transposeB,
588 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
589 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
590 const Teuchos::RCP<Teuchos::ParameterList>& params)
591{
592 using Teuchos::RCP;
593 using Teuchos::rcpFromRef;
594 using Teuchos::rcp;
595 using Teuchos::ParameterList;
597 if(!params.is_null())
598 {
600 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
601 std::invalid_argument,
602 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
603 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
604 params->set("Call fillComplete", true);
605 }
606 //If transposeB, must compute B's explicit transpose to
607 //get the correct row map for C.
609 if(transposeB)
610 {
612 Brcp = transposer.createTranspose();
613 }
614 //Check that A,B are fillComplete before getting B's column map
616 (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
617 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
618 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
619 //this version of add() always fill completes the result, no matter what is in params on input
620 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
621 return C;
622}
623
624//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
625//but since the spadd() output is always packed there is no need for a separate
626//numRowEntries here.
627//
628template<class LO, class GO, class LOView, class GOView, class LocalMap>
629struct ConvertGlobalToLocalFunctor
630{
631 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
632 : lids(lids_), gids(gids_), localColMap(localColMap_)
633 {}
634
635 KOKKOS_FUNCTION void operator() (const GO i) const
636 {
637 lids(i) = localColMap.getLocalElement(gids(i));
638 }
639
640 LOView lids;
641 const GOView gids;
642 const LocalMap localColMap;
643};
644
645template <class Scalar,
646 class LocalOrdinal,
647 class GlobalOrdinal,
648 class Node>
649void
651 const bool transposeA,
653 const Scalar& beta,
654 const bool transposeB,
657 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
658 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
659 const Teuchos::RCP<Teuchos::ParameterList>& params)
660{
661 using Teuchos::RCP;
662 using Teuchos::rcp;
663 using Teuchos::rcpFromRef;
664 using Teuchos::rcp_implicit_cast;
665 using Teuchos::rcp_dynamic_cast;
666 using Teuchos::TimeMonitor;
667 using SC = Scalar;
668 using LO = LocalOrdinal;
669 using GO = GlobalOrdinal;
670 using NO = Node;
671 using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
672 using crs_graph_type = CrsGraph<LO,GO,NO>;
673 using map_type = Map<LO,GO,NO>;
675 using import_type = Import<LO,GO,NO>;
676 using export_type = Export<LO,GO,NO>;
677 using exec_space = typename crs_graph_type::execution_space;
678 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
679 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
680 constexpr bool debug = false;
681
682#ifdef HAVE_TPETRA_MMM_TIMINGS
683 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
684#endif
685
686 if (debug) {
687 std::ostringstream os;
688 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
689 << "TpetraExt::MatrixMatrix::add" << std::endl;
690 std::cerr << os.str ();
691 }
692
694 (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
695 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
697 (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
698 prefix_mmm << "A and B must both be fill complete.");
699#ifdef HAVE_TPETRA_DEBUG
700 // The matrices don't have domain or range Maps unless they are fill complete.
701 if (A.isFillComplete () && B.isFillComplete ()) {
702 const bool domainMapsSame =
703 (! transposeA && ! transposeB &&
704 ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
705 (! transposeA && transposeB &&
706 ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
707 ( transposeA && ! transposeB &&
708 ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
709 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
710 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
711
712 const bool rangeMapsSame =
713 (! transposeA && ! transposeB &&
714 ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
715 (! transposeA && transposeB &&
716 ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
717 ( transposeA && ! transposeB &&
718 ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
719 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
720 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
721 }
722#endif // HAVE_TPETRA_DEBUG
723
724 using Teuchos::ParameterList;
725 // Form the explicit transpose of A if necessary.
727 if (transposeA) {
729 Aprime = transposer.createTranspose ();
730 }
731
732#ifdef HAVE_TPETRA_DEBUG
734 (Aprime.is_null (), std::logic_error,
735 prefix_mmm << "Failed to compute Op(A). "
736 "Please report this bug to the Tpetra developers.");
737#endif // HAVE_TPETRA_DEBUG
738
739 // Form the explicit transpose of B if necessary.
741 if (transposeB) {
742 if (debug) {
743 std::ostringstream os;
744 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
745 << "Form explicit xpose of B" << std::endl;
746 std::cerr << os.str ();
747 }
749 Bprime = transposer.createTranspose ();
750 }
751#ifdef HAVE_TPETRA_DEBUG
752 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
753 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
755 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
756 prefix_mmm << "Aprime and Bprime must both be fill complete. "
757 "Please report this bug to the Tpetra developers.");
758#endif // HAVE_TPETRA_DEBUG
761 if(CDomainMap.is_null())
762 {
763 CDomainMap = Bprime->getDomainMap();
764 }
765 if(CRangeMap.is_null())
766 {
767 CRangeMap = Bprime->getRangeMap();
768 }
769 assert(!(CDomainMap.is_null()));
770 assert(!(CRangeMap.is_null()));
771 typedef typename AddKern::values_array values_array;
772 typedef typename AddKern::row_ptrs_array row_ptrs_array;
773 typedef typename AddKern::col_inds_array col_inds_array;
774 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
775 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
776 values_array vals;
777 row_ptrs_array rowptrs;
778 col_inds_array colinds;
779#ifdef HAVE_TPETRA_MMM_TIMINGS
780 MM = Teuchos::null;
781 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
782#endif
783 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
784 {
785 //import Aprime into Bprime's row map so the local matrices have same # of rows
786 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
787 // cbl do not set
788 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
789 // this import _may_ take the form of a transfer. In practice it would be unlikely,
790 // but the general case is not so forgiving.
791 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
792 }
793 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
795 RCP<const import_type> Cimport = Teuchos::null;
796 RCP<export_type> Cexport = Teuchos::null;
797 bool doFillComplete = true;
798 if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
799 {
800 doFillComplete = params->get<bool>("Call fillComplete");
801 }
802 auto Alocal = Aprime->getLocalMatrixDevice();
803 auto Blocal = Bprime->getLocalMatrixDevice();
804 LO numLocalRows = Alocal.numRows();
805 if(numLocalRows == 0)
806 {
807 //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
808 //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
809 //Handle this case now
810 //(without interfering with collective operations, since it's possible for
811 //some ranks to have 0 local rows and others not).
812 rowptrs = row_ptrs_array("C rowptrs", 0);
813 }
814 auto Acolmap = Aprime->getColMap();
815 auto Bcolmap = Bprime->getColMap();
816 if(!matchingColMaps)
817 {
818 using global_col_inds_array = typename AddKern::global_col_inds_array;
819#ifdef HAVE_TPETRA_MMM_TIMINGS
820 MM = Teuchos::null;
821 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
822#endif
823 //use kernel that converts col indices in both A and B to common domain map before adding
824 auto AlocalColmap = Acolmap->getLocalMap();
825 auto BlocalColmap = Bcolmap->getLocalMap();
826 global_col_inds_array globalColinds;
827 if (debug) {
828 std::ostringstream os;
829 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
830 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
831 std::cerr << os.str ();
832 }
833 AddKern::convertToGlobalAndAdd(
836 if (debug) {
837 std::ostringstream os;
838 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
839 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
840 std::cerr << os.str ();
841 }
842#ifdef HAVE_TPETRA_MMM_TIMINGS
843 MM = Teuchos::null;
844 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
845#endif
847 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
849 C.replaceColMap(CcolMap);
850 col_inds_array localColinds("C colinds", globalColinds.extent(0));
851 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
852 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
853 col_inds_array, global_col_inds_array,
854 typename map_type::local_map_type>
855 (localColinds, globalColinds, CcolMap->getLocalMap()));
856 KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
857 C.setAllValues(rowptrs, localColinds, vals);
858 C.fillComplete(CDomainMap, CRangeMap, params);
859 if(!doFillComplete)
860 C.resumeFill();
861 }
862 else
863 {
864 //Aprime, Bprime and C all have the same column maps
865 auto Avals = Alocal.values;
866 auto Bvals = Blocal.values;
867 auto Arowptrs = Alocal.graph.row_map;
868 auto Browptrs = Blocal.graph.row_map;
869 auto Acolinds = Alocal.graph.entries;
870 auto Bcolinds = Blocal.graph.entries;
871 if(sorted)
872 {
873 //use sorted kernel
874#ifdef HAVE_TPETRA_MMM_TIMINGS
875 MM = Teuchos::null;
876 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
877#endif
878 if (debug) {
879 std::ostringstream os;
880 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
881 << "Call AddKern::addSorted(...)" << std::endl;
882 std::cerr << os.str ();
883 }
884#if KOKKOSKERNELS_VERSION >= 40299
885 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
886#else
887 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
888#endif
889 }
890 else
891 {
892 //use unsorted kernel
893#ifdef HAVE_TPETRA_MMM_TIMINGS
894 MM = Teuchos::null;
895 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
896#endif
897 if (debug) {
898 std::ostringstream os;
899 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
900 << "Call AddKern::addUnsorted(...)" << std::endl;
901 std::cerr << os.str ();
902 }
903 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
904 }
905 //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
907 C.replaceColMap(Ccolmap);
908 C.setAllValues(rowptrs, colinds, vals);
910 {
911 #ifdef HAVE_TPETRA_MMM_TIMINGS
912 MM = Teuchos::null;
913 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
914 #endif
915 if(!CDomainMap->isSameAs(*Ccolmap))
916 {
917 if (debug) {
918 std::ostringstream os;
919 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
920 << "Create Cimport" << std::endl;
921 std::cerr << os.str ();
922 }
923 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
924 }
925 if(!C.getRowMap()->isSameAs(*CRangeMap))
926 {
927 if (debug) {
928 std::ostringstream os;
929 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
930 << "Create Cexport" << std::endl;
931 std::cerr << os.str ();
932 }
933 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
934 }
935
936 if (debug) {
937 std::ostringstream os;
938 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
939 << "Call C->expertStaticFillComplete(...)" << std::endl;
940 std::cerr << os.str ();
941 }
942 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
943 }
944 }
945}
946
947// This version of Add takes C as RCP&, so C may be null on input (in this case,
948// it is allocated and constructed in this function).
949template <class Scalar,
950 class LocalOrdinal,
951 class GlobalOrdinal,
952 class Node>
953void Add(
955 bool transposeA,
958 bool transposeB,
961{
962 using Teuchos::Array;
963 using Teuchos::ArrayRCP;
964 using Teuchos::ArrayView;
965 using Teuchos::RCP;
966 using Teuchos::rcp;
967 using Teuchos::rcp_dynamic_cast;
968 using Teuchos::rcpFromRef;
969 using Teuchos::tuple;
970 using std::endl;
971 // typedef typename ArrayView<const Scalar>::size_type size_type;
972 typedef Teuchos::ScalarTraits<Scalar> STS;
974 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
975 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
976 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
979
980 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
981
983 ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
984 prefix << "A and B must both be fill complete before calling this function.");
985
986 if(C.is_null()) {
987 TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
988 prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
989 "Please report this bug to the Tpetra developers.");
990 TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
991 prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
992 "Please report this bug to the Tpetra developers.");
993 }
994
995#ifdef HAVE_TPETRA_DEBUG
996 {
997 const bool domainMapsSame =
998 (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
999 (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
1000 (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
1001 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1002 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
1003
1004 const bool rangeMapsSame =
1005 (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
1006 (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
1007 (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
1008 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1009 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
1010 }
1011#endif // HAVE_TPETRA_DEBUG
1012
1013 using Teuchos::ParameterList;
1015 transposeParams->set ("sort", false);
1016
1017 // Form the explicit transpose of A if necessary.
1019 if (transposeA) {
1021 Aprime = theTransposer.createTranspose (transposeParams);
1022 }
1023 else {
1024 Aprime = rcpFromRef (A);
1025 }
1026
1027#ifdef HAVE_TPETRA_DEBUG
1028 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1029 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1030#endif // HAVE_TPETRA_DEBUG
1031
1032 // Form the explicit transpose of B if necessary.
1034 if (transposeB) {
1036 Bprime = theTransposer.createTranspose (transposeParams);
1037 }
1038 else {
1039 Bprime = rcpFromRef (B);
1040 }
1041
1042#ifdef HAVE_TPETRA_DEBUG
1043 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1044 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1045#endif // HAVE_TPETRA_DEBUG
1046
1047 bool CwasFillComplete = false;
1048
1049 // Allocate or zero the entries of the result matrix.
1050 if (! C.is_null ()) {
1051 CwasFillComplete = C->isFillComplete();
1053 C->resumeFill();
1054 C->setAllToScalar (STS::zero ());
1055 } else {
1056 // FIXME (mfh 08 May 2013) When I first looked at this method, I
1057 // noticed that C was being given the row Map of Aprime (the
1058 // possibly transposed version of A). Is this what we want?
1059
1060 // It is a precondition that Aprime and Bprime have the same domain and range maps.
1061 // However, they may have different row maps. In this case, it's difficult to
1062 // get a precise upper bound on the number of entries in each local row of C, so
1063 // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
1064 if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1065 LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1067 for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1068 CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1069 }
1070 C = rcp (new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1071 }
1072 else {
1073 // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1074 C = rcp (new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1075 }
1076 }
1077
1078#ifdef HAVE_TPETRA_DEBUG
1079 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1080 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1081 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1082 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1083 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1084 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1085#endif // HAVE_TPETRA_DEBUG
1086
1090
1091 // do a loop over each matrix to add: A reordering might be more efficient
1092 for (int k = 0; k < 2; ++k) {
1093 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1094 typename crs_matrix_type::nonconst_values_host_view_type Values;
1095
1096 // Loop over each locally owned row of the current matrix (either
1097 // Aprime or Bprime), and sum its entries into the corresponding
1098 // row of C. This works regardless of whether Aprime or Bprime
1099 // has the same row Map as C, because both sumIntoGlobalValues and
1100 // insertGlobalValues allow summing resp. inserting into nonowned
1101 // rows of C.
1102#ifdef HAVE_TPETRA_DEBUG
1103 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1104 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1105#endif // HAVE_TPETRA_DEBUG
1106 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1107#ifdef HAVE_TPETRA_DEBUG
1108 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1109 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1110#endif // HAVE_TPETRA_DEBUG
1111
1112 const size_t localNumRows = Mat[k]->getLocalNumRows ();
1113 for (size_t i = 0; i < localNumRows; ++i) {
1114 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1115 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1116 if (numEntries > 0) {
1117 if(numEntries > Indices.extent(0)) {
1118 Kokkos::resize(Indices, numEntries);
1119 Kokkos::resize(Values, numEntries);
1120 }
1121 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1122
1123 if (scalar[k] != STS::one ()) {
1124 for (size_t j = 0; j < numEntries; ++j) {
1125 Values[j] *= scalar[k];
1126 }
1127 }
1128
1129 if (CwasFillComplete) {
1130 size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1131 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1132 TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1133 prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1134 } else {
1135 C->insertGlobalValues (globalRow, numEntries,
1136 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1137 }
1138 }
1139 }
1140 }
1141 if(CwasFillComplete) {
1142 C->fillComplete(C->getDomainMap (),
1143 C->getRangeMap ());
1144 }
1145}
1146
1147// This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1148// to the above version where C is RCP&.
1149template <class Scalar,
1150 class LocalOrdinal,
1151 class GlobalOrdinal,
1152 class Node>
1153void Add(
1155 bool transposeA,
1158 bool transposeB,
1161{
1162 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1163
1164 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1165 prefix << "C must not be null");
1166
1167 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1169}
1170
1171} //End namespace MatrixMatrix
1172
1173namespace MMdetails{
1174
1175/*********************************************************************************************************/
1177//template <class TransferType>
1178//void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1179// if (Transfer.is_null())
1180// return;
1181//
1182// const Distributor & Distor = Transfer->getDistributor();
1183// Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1184//
1185// size_t rows_send = Transfer->getNumExportIDs();
1186// size_t rows_recv = Transfer->getNumRemoteIDs();
1187//
1188// size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1189// size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1190// size_t num_send_neighbors = Distor.getNumSends();
1191// size_t num_recv_neighbors = Distor.getNumReceives();
1192// size_t round2_send, round2_recv;
1193// Distor.getLastDoStatistics(round2_send,round2_recv);
1194//
1195// int myPID = Comm->getRank();
1196// int NumProcs = Comm->getSize();
1197//
1198// // Processor by processor statistics
1199// // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1200// // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1201//
1202// // Global statistics
1203// size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1204// size_t gstats_min[8], gstats_max[8];
1205//
1206// double lstats_avg[8], gstats_avg[8];
1207// for(int i=0; i<8; i++)
1208// lstats_avg[i] = ((double)lstats[i])/NumProcs;
1209//
1210// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1211// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1212// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1213//
1214// if(!myPID) {
1215// printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1216// (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1217// (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1218// printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1219// (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1220// (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1221// }
1222//}
1223
1224// Kernel method for computing the local portion of C = A*B
1225template<class Scalar,
1226 class LocalOrdinal,
1227 class GlobalOrdinal,
1228 class Node>
1229void mult_AT_B_newmatrix(
1230 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1231 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1232 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1233 const std::string & label,
1234 const Teuchos::RCP<Teuchos::ParameterList>& params)
1235{
1236 using Teuchos::RCP;
1237 using Teuchos::rcp;
1238 typedef Scalar SC;
1239 typedef LocalOrdinal LO;
1240 typedef GlobalOrdinal GO;
1241 typedef Node NO;
1242 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1243 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1244
1245#ifdef HAVE_TPETRA_MMM_TIMINGS
1246 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1247 using Teuchos::TimeMonitor;
1248 RCP<TimeMonitor> MM = rcp (new TimeMonitor
1249 (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1250#endif
1251
1252 /*************************************************************/
1253 /* 1) Local Transpose of A */
1254 /*************************************************************/
1255 transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1256
1257 using Teuchos::ParameterList;
1258 RCP<ParameterList> transposeParams (new ParameterList);
1259 transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1260 if(! params.is_null ()) {
1261 transposeParams->set ("compute global constants",
1262 params->get ("compute global constants: temporaries",
1263 false));
1264 }
1265 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1266 transposer.createTransposeLocal (transposeParams);
1267
1268 /*************************************************************/
1269 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1270 /*************************************************************/
1271#ifdef HAVE_TPETRA_MMM_TIMINGS
1272 MM = Teuchos::null;
1273 MM = rcp (new TimeMonitor
1274 (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1275#endif
1276
1277 // Get views, asserting that no import is required to speed up computation
1278 crs_matrix_struct_type Aview;
1279 crs_matrix_struct_type Bview;
1280 RCP<const Import<LO, GO, NO> > dummyImporter;
1281
1282 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1283 RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1284 if (! params.is_null ()) {
1285 importParams1->set ("compute global constants",
1286 params->get ("compute global constants: temporaries",
1287 false));
1288 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1289 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1290 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1291 int mm_optimization_core_count =
1293 mm_optimization_core_count =
1294 slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295 int mm_optimization_core_count2 =
1296 params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1297 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1298 mm_optimization_core_count = mm_optimization_core_count2;
1299 }
1300 auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1301 sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1302 sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1303 sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1304 }
1305
1306 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1307 Aview, dummyImporter, true,
1308 label, importParams1);
1309
1310 RCP<ParameterList> importParams2 (new ParameterList);
1311 if (! params.is_null ()) {
1312 importParams2->set ("compute global constants",
1313 params->get ("compute global constants: temporaries",
1314 false));
1315 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1316 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1317 bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1318 int mm_optimization_core_count =
1320 mm_optimization_core_count =
1321 slist.get ("MM_TAFC_OptimizationCoreCount",
1322 mm_optimization_core_count);
1323 int mm_optimization_core_count2 =
1324 params->get ("MM_TAFC_OptimizationCoreCount",
1325 mm_optimization_core_count);
1326 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1327 mm_optimization_core_count = mm_optimization_core_count2;
1328 }
1329 auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1330 sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1331 sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1332 sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1333 }
1334
1335 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1336 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1337 }
1338 else {
1339 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1340 }
1341
1342#ifdef HAVE_TPETRA_MMM_TIMINGS
1343 MM = Teuchos::null;
1344 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1345#endif
1346
1347 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1348
1349 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1350 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1351 if (needs_final_export) {
1352 Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1353 }
1354 else {
1355 Ctemp = rcp (&C, false);
1356 }
1357
1358 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1359
1360 /*************************************************************/
1361 /* 4) exportAndFillComplete matrix */
1362 /*************************************************************/
1363#ifdef HAVE_TPETRA_MMM_TIMINGS
1364 MM = Teuchos::null;
1365 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1366#endif
1367
1368 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1369
1370 if (needs_final_export) {
1371 ParameterList labelList;
1372 labelList.set("Timer Label", label);
1373 if(!params.is_null()) {
1374 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1375 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1376 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1377 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1378 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1379 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1380 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1381 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1382 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1383
1384 labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1385 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1386 labelList.set("compute global constants",params->get("compute global constants",true));
1387 labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1388 }
1389 Ctemp->exportAndFillComplete (Crcp,
1390 *Ctemp->getGraph ()->getExporter (),
1391 B.getDomainMap (),
1392 A.getDomainMap (),
1393 rcp (&labelList, false));
1394 }
1395#ifdef HAVE_TPETRA_MMM_STATISTICS
1396 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1397#endif
1398}
1399
1400/*********************************************************************************************************/
1401// Kernel method for computing the local portion of C = A*B
1402template<class Scalar,
1403 class LocalOrdinal,
1404 class GlobalOrdinal,
1405 class Node>
1406void mult_A_B(
1407 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1408 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1409 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1410 const std::string& /* label */,
1411 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1412{
1413 using Teuchos::Array;
1414 using Teuchos::ArrayRCP;
1415 using Teuchos::ArrayView;
1416 using Teuchos::OrdinalTraits;
1417 using Teuchos::null;
1418
1419 typedef Teuchos::ScalarTraits<Scalar> STS;
1420 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1421 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1422 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1423
1424 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1425 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1426
1427 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1428 ArrayView<const GlobalOrdinal> bcols_import = null;
1429 if (Bview.importColMap != null) {
1430 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1431 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1432
1433 bcols_import = Bview.importColMap->getLocalElementList();
1434 }
1435
1436 size_t C_numCols = C_lastCol - C_firstCol +
1437 OrdinalTraits<LocalOrdinal>::one();
1438 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1439 OrdinalTraits<LocalOrdinal>::one();
1440
1441 if (C_numCols_import > C_numCols)
1442 C_numCols = C_numCols_import;
1443
1444 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1445 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1446 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1447
1448 Array<Scalar> C_row_i = dwork;
1449 Array<GlobalOrdinal> C_cols = iwork;
1450 Array<size_t> c_index = iwork2;
1451 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1452 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1453
1454 size_t C_row_i_length, j, k, last_index;
1455
1456 // Run through all the hash table lookups once and for all
1457 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1458 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1459 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1460 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1461 // Maps are the same: Use local IDs as the hash
1462 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1463 Aview.colMap->getMaxLocalIndex(); i++)
1464 Acol2Brow[i]=i;
1465 }
1466 else {
1467 // Maps are not the same: Use the map's hash
1468 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1469 Aview.colMap->getMaxLocalIndex(); i++) {
1470 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1471 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1472 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1473 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1474 }
1475 }
1476
1477 // To form C = A*B we're going to execute this expression:
1478 //
1479 // C(i,j) = sum_k( A(i,k)*B(k,j) )
1480 //
1481 // Our goal, of course, is to navigate the data in A and B once, without
1482 // performing searches for column-indices, etc.
1483 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1484 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1485 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1486 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1487 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1488 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1489 decltype(Browptr) Irowptr;
1490 decltype(Bcolind) Icolind;
1491 decltype(Bvals) Ivals;
1492 if(!Bview.importMatrix.is_null()) {
1493 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1494 Icolind = Bview.importMatrix->getLocalIndicesHost();
1495 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1496 }
1497
1498 bool C_filled = C.isFillComplete();
1499
1500 for (size_t i = 0; i < C_numCols; i++)
1501 c_index[i] = OrdinalTraits<size_t>::invalid();
1502
1503 // Loop over the rows of A.
1504 size_t Arows = Aview.rowMap->getLocalNumElements();
1505 for(size_t i=0; i<Arows; ++i) {
1506
1507 // Only navigate the local portion of Aview... which is, thankfully, all of
1508 // A since this routine doesn't do transpose modes
1509 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1510
1511 // Loop across the i-th row of A and for each corresponding row in B, loop
1512 // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1513 // quantities C_row_i. In other words, as we stride across B(k,:) we're
1514 // calculating updates for row i of the result matrix C.
1515 C_row_i_length = OrdinalTraits<size_t>::zero();
1516
1517 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1518 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1519 const Scalar Aval = Avals[k];
1520 if (Aval == STS::zero())
1521 continue;
1522
1523 if (Ak == LO_INVALID)
1524 continue;
1525
1526 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1527 LocalOrdinal col = Bcolind[j];
1528 //assert(col >= 0 && col < C_numCols);
1529
1530 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1531 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1532 // This has to be a += so insertGlobalValue goes out
1533 C_row_i[C_row_i_length] = Aval*Bvals[j];
1534 C_cols[C_row_i_length] = col;
1535 c_index[col] = C_row_i_length;
1536 C_row_i_length++;
1537
1538 } else {
1539 // static cast from impl_scalar_type to Scalar needed for complex
1540 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1541 }
1542 }
1543 }
1544
1545 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1546 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1547 C_cols[ii] = bcols[C_cols[ii]];
1548 combined_index[ii] = C_cols[ii];
1549 combined_values[ii] = C_row_i[ii];
1550 }
1551 last_index = C_row_i_length;
1552
1553 //
1554 //Now put the C_row_i values into C.
1555 //
1556 // We might have to revamp this later.
1557 C_row_i_length = OrdinalTraits<size_t>::zero();
1558
1559 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1560 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1561 const Scalar Aval = Avals[k];
1562 if (Aval == STS::zero())
1563 continue;
1564
1565 if (Ak!=LO_INVALID) continue;
1566
1567 Ak = Acol2Irow[Acolind[k]];
1568 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1569 LocalOrdinal col = Icolind[j];
1570 //assert(col >= 0 && col < C_numCols);
1571
1572 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1573 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1574 // This has to be a += so insertGlobalValue goes out
1575 C_row_i[C_row_i_length] = Aval*Ivals[j];
1576 C_cols[C_row_i_length] = col;
1577 c_index[col] = C_row_i_length;
1578 C_row_i_length++;
1579
1580 } else {
1581 // This has to be a += so insertGlobalValue goes out
1582 // static cast from impl_scalar_type to Scalar needed for complex
1583 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1584 }
1585 }
1586 }
1587
1588 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1589 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1590 C_cols[ii] = bcols_import[C_cols[ii]];
1591 combined_index[last_index] = C_cols[ii];
1592 combined_values[last_index] = C_row_i[ii];
1593 last_index++;
1594 }
1595
1596 // Now put the C_row_i values into C.
1597 // We might have to revamp this later.
1598 C_filled ?
1599 C.sumIntoGlobalValues(
1600 global_row,
1601 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1602 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1603 :
1604 C.insertGlobalValues(
1605 global_row,
1606 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1607 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1608
1609 }
1610}
1611
1612/*********************************************************************************************************/
1613template<class Scalar,
1614 class LocalOrdinal,
1615 class GlobalOrdinal,
1616 class Node>
1617void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1618 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1619 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1620
1621 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1622 Mview.maxNumRowEntries = Mview.indices[0].size();
1623
1624 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1625 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1626 Mview.maxNumRowEntries = Mview.indices[i].size();
1627 }
1628}
1629
1630/*********************************************************************************************************/
1631template<class CrsMatrixType>
1632size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1633 // Follows the NZ estimate in ML's ml_matmatmult.c
1634 size_t Aest = 100, Best=100;
1635 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1636 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1637 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1638 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1639
1640 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1641 nnzperrow *= nnzperrow;
1642
1643 return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1644}
1645
1646
1647/*********************************************************************************************************/
1648// Kernel method for computing the local portion of C = A*B for CrsMatrix
1649//
1650// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1651// function, so this is probably the function we want to
1652// thread-parallelize.
1653template<class Scalar,
1654 class LocalOrdinal,
1655 class GlobalOrdinal,
1656 class Node>
1657void mult_A_B_newmatrix(
1658 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1659 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1660 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1661 const std::string& label,
1662 const Teuchos::RCP<Teuchos::ParameterList>& params)
1663{
1664 using Teuchos::Array;
1665 using Teuchos::ArrayRCP;
1666 using Teuchos::ArrayView;
1667 using Teuchos::RCP;
1668 using Teuchos::rcp;
1669
1670 // Tpetra typedefs
1671 typedef LocalOrdinal LO;
1672 typedef GlobalOrdinal GO;
1673 typedef Node NO;
1674 typedef Import<LO,GO,NO> import_type;
1675 typedef Map<LO,GO,NO> map_type;
1676
1677 // Kokkos typedefs
1678 typedef typename map_type::local_map_type local_map_type;
1680 typedef typename KCRS::StaticCrsGraphType graph_t;
1681 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1682 typedef typename NO::execution_space execution_space;
1683 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1684 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1685
1686#ifdef HAVE_TPETRA_MMM_TIMINGS
1687 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1688 using Teuchos::TimeMonitor;
1689 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1690#endif
1691 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1692
1693 // Build the final importer / column map, hash table lookups for C
1694 RCP<const import_type> Cimport;
1695 RCP<const map_type> Ccolmap;
1696 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1697 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1698 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1699 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1700 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1701 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1702 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1703 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1704
1705 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1706 // indices of B, to local column indices of C. (B and C have the
1707 // same number of columns.) The kernel uses this, instead of
1708 // copying the entire input matrix B and converting its column
1709 // indices to those of C.
1710 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1711
1712 if (Bview.importMatrix.is_null()) {
1713 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1714 Cimport = Bimport;
1715 Ccolmap = Bview.colMap;
1716 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1717 // Bcol2Ccol is trivial
1718 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1719 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1720 KOKKOS_LAMBDA(const LO i) {
1721 Bcol2Ccol(i) = i;
1722 });
1723 }
1724 else {
1725 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1726 // column Map of C, as well as C's Import object (from its domain
1727 // Map to its column Map). C's column Map is the union of the
1728 // column Maps of (the local part of) B, and the "remote" part of
1729 // B. Ditto for the Import. We have optimized this "setUnion"
1730 // operation on Import objects and Maps.
1731
1732 // Choose the right variant of setUnion
1733 if (!Bimport.is_null() && !Iimport.is_null()) {
1734 Cimport = Bimport->setUnion(*Iimport);
1735 }
1736 else if (!Bimport.is_null() && Iimport.is_null()) {
1737 Cimport = Bimport->setUnion();
1738 }
1739 else if (Bimport.is_null() && !Iimport.is_null()) {
1740 Cimport = Iimport->setUnion();
1741 }
1742 else {
1743 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1744 }
1745 Ccolmap = Cimport->getTargetMap();
1746
1747 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1748 // in general. We should get rid of it in order to reduce
1749 // communication costs of sparse matrix-matrix multiply.
1750 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1751 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1752
1753 // NOTE: This is not efficient and should be folded into setUnion
1754 //
1755 // mfh 27 Sep 2016: What the above comment means, is that the
1756 // setUnion operation on Import objects could also compute these
1757 // local index - to - local index look-up tables.
1758 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1759 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1760 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1761 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1762 });
1763 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1764 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1765 });
1766
1767 }
1768
1769 // Replace the column map
1770 //
1771 // mfh 27 Sep 2016: We do this because C was originally created
1772 // without a column Map. Now we have its column Map.
1773 C.replaceColMap(Ccolmap);
1774
1775 // mfh 27 Sep 2016: Construct tables that map from local column
1776 // indices of A, to local row indices of either B_local (the locally
1777 // owned part of B), or B_remote (the "imported" remote part of B).
1778 //
1779 // For column index Aik in row i of A, if the corresponding row of B
1780 // exists in the local part of B ("orig") (which I'll call B_local),
1781 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1782 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1783 //
1784 // For column index Aik in row i of A, if the corresponding row of B
1785 // exists in the remote part of B ("Import") (which I'll call
1786 // B_remote), then targetMapToImportRow[Aik] is the local index of
1787 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1788 // (a flag value).
1789
1790 // Run through all the hash table lookups once and for all
1791 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1792 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1793
1794 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1795 GO aidx = Acolmap_local.getGlobalElement(i);
1796 LO B_LID = Browmap_local.getLocalElement(aidx);
1797 if (B_LID != LO_INVALID) {
1798 targetMapToOrigRow(i) = B_LID;
1799 targetMapToImportRow(i) = LO_INVALID;
1800 } else {
1801 LO I_LID = Irowmap_local.getLocalElement(aidx);
1802 targetMapToOrigRow(i) = LO_INVALID;
1803 targetMapToImportRow(i) = I_LID;
1804
1805 }
1806 });
1807
1808 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1809 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1810 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1811
1812}
1813
1814/*********************************************************************************************************/
1815// Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1816template<class Scalar,
1817 class LocalOrdinal,
1818 class GlobalOrdinal,
1819 class Node>
1820void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1821 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1822 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1823{
1824 using Teuchos::null;
1825 using Teuchos::Array;
1826 using Teuchos::ArrayRCP;
1827 using Teuchos::ArrayView;
1828 using Teuchos::RCP;
1829 using Teuchos::rcp;
1830
1831 // Tpetra typedefs
1832 typedef LocalOrdinal LO;
1833 typedef GlobalOrdinal GO;
1834 typedef Node NO;
1835 typedef Import<LO,GO,NO> import_type;
1836 typedef Map<LO,GO,NO> map_type;
1837 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1838 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1839
1840 // Kokkos typedefs
1841 typedef typename map_type::local_map_type local_map_type;
1842 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1843 typedef typename KBSR::device_type device_t;
1844 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1845 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1846 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1847 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1848 typedef typename NO::execution_space execution_space;
1849 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1850 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1851
1852 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1853
1854 // Build the final importer / column map, hash table lookups for C
1855 RCP<const import_type> Cimport;
1856 RCP<const map_type> Ccolmap;
1857 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1858 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1859 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1860 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1861 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1862 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1863 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1864 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1865
1866 // Bcol2Ccol is a table that maps from local column
1867 // indices of B, to local column indices of C. (B and C have the
1868 // same number of columns.) The kernel uses this, instead of
1869 // copying the entire input matrix B and converting its column
1870 // indices to those of C.
1871 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1872
1873 if (Bview.importMatrix.is_null()) {
1874 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1875 Cimport = Bimport;
1876 Ccolmap = Bview.colMap;
1877 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1878 // Bcol2Ccol is trivial
1879 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1880 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1881 KOKKOS_LAMBDA(const LO i) {
1882 Bcol2Ccol(i) = i;
1883 });
1884 }
1885 else {
1886 // B has "remotes," so we need to build the
1887 // column Map of C, as well as C's Import object (from its domain
1888 // Map to its column Map). C's column Map is the union of the
1889 // column Maps of (the local part of) B, and the "remote" part of
1890 // B. Ditto for the Import. We have optimized this "setUnion"
1891 // operation on Import objects and Maps.
1892
1893 // Choose the right variant of setUnion
1894 if (!Bimport.is_null() && !Iimport.is_null()) {
1895 Cimport = Bimport->setUnion(*Iimport);
1896 }
1897 else if (!Bimport.is_null() && Iimport.is_null()) {
1898 Cimport = Bimport->setUnion();
1899 }
1900 else if (Bimport.is_null() && !Iimport.is_null()) {
1901 Cimport = Iimport->setUnion();
1902 }
1903 else {
1904 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1905 }
1906 Ccolmap = Cimport->getTargetMap();
1907
1908 // NOTE: This is not efficient and should be folded into setUnion
1909 //
1910 // What the above comment means, is that the
1911 // setUnion operation on Import objects could also compute these
1912 // local index - to - local index look-up tables.
1913 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1914 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1915 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1916 range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1917 KOKKOS_LAMBDA(const LO i) {
1918 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1919 });
1920 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1921 range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1922 KOKKOS_LAMBDA(const LO i) {
1923 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1924 });
1925 }
1926
1927 // Construct tables that map from local column
1928 // indices of A, to local row indices of either B_local (the locally
1929 // owned part of B), or B_remote (the "imported" remote part of B).
1930 //
1931 // For column index Aik in row i of A, if the corresponding row of B
1932 // exists in the local part of B ("orig") (which I'll call B_local),
1933 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1934 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1935 //
1936 // For column index Aik in row i of A, if the corresponding row of B
1937 // exists in the remote part of B ("Import") (which I'll call
1938 // B_remote), then targetMapToImportRow[Aik] is the local index of
1939 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1940 // (a flag value).
1941
1942 // Run through all the hash table lookups once and for all
1943 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1944 Aview.colMap->getLocalNumElements());
1945 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1946 Aview.colMap->getLocalNumElements());
1947
1948 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1949 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1950 KOKKOS_LAMBDA(const LO i) {
1951 GO aidx = Acolmap_local.getGlobalElement(i);
1952 LO B_LID = Browmap_local.getLocalElement(aidx);
1953 if (B_LID != LO_INVALID) {
1954 targetMapToOrigRow(i) = B_LID;
1955 targetMapToImportRow(i) = LO_INVALID;
1956 } else {
1957 LO I_LID = Irowmap_local.getLocalElement(aidx);
1958 targetMapToOrigRow(i) = LO_INVALID;
1959 targetMapToImportRow(i) = I_LID;
1960 }
1961 });
1962
1963 // Create the KernelHandle
1964 using KernelHandle =
1965 KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1966 typename lno_nnz_view_t::const_value_type,
1967 typename scalar_view_t::const_value_type,
1968 typename device_t::execution_space,
1969 typename device_t::memory_space,
1970 typename device_t::memory_space>;
1971 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1972 std::string myalg("SPGEMM_KK_MEMORY");
1973 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1974
1975 KernelHandle kh;
1976 kh.create_spgemm_handle(alg_enum);
1977 kh.set_team_work_size(team_work_size);
1978
1979 // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1980 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1981 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1982 targetMapToOrigRow,targetMapToImportRow,
1983 Bcol2Ccol,Icol2Ccol,
1984 Ccolmap.getConst()->getLocalNumElements());
1985
1986 RCP<graph_t> graphC;
1987 typename KBSR::values_type values;
1988 {
1989 // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1990 // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1991 KBSR Cmat;
1992 KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1993 KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
1994 kh.destroy_spgemm_handle();
1995
1996 // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1997 graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1998 values = Cmat.values;
1999 }
2000 C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2001
2002}
2003
2004/*********************************************************************************************************/
2005// AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
2006template<class Scalar,
2007 class LocalOrdinal,
2008 class GlobalOrdinal,
2009 class Node,
2010 class LocalOrdinalViewType>
2011void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2012 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2013 const LocalOrdinalViewType & targetMapToOrigRow,
2014 const LocalOrdinalViewType & targetMapToImportRow,
2015 const LocalOrdinalViewType & Bcol2Ccol,
2016 const LocalOrdinalViewType & Icol2Ccol,
2017 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2018 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2019 const std::string& label,
2020 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2021#ifdef HAVE_TPETRA_MMM_TIMINGS
2022 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2023 using Teuchos::TimeMonitor;
2024 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2025#endif
2026
2027 using Teuchos::Array;
2028 using Teuchos::ArrayRCP;
2029 using Teuchos::ArrayView;
2030 using Teuchos::RCP;
2031 using Teuchos::rcp;
2032
2033 // Lots and lots of typedefs
2034 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2035 typedef typename KCRS::StaticCrsGraphType graph_t;
2036 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2037 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2038 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2039 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2040
2041 typedef Scalar SC;
2042 typedef LocalOrdinal LO;
2043 typedef GlobalOrdinal GO;
2044 typedef Node NO;
2045 typedef Map<LO,GO,NO> map_type;
2046 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2047 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2048 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2049
2050 // Sizes
2051 RCP<const map_type> Ccolmap = C.getColMap();
2052 size_t m = Aview.origMatrix->getLocalNumRows();
2053 size_t n = Ccolmap->getLocalNumElements();
2054 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2055
2056 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2057 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2058 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2059
2060 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2061 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2062 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2063
2064 c_lno_view_t Irowptr;
2065 lno_nnz_view_t Icolind;
2066 scalar_view_t Ivals;
2067 if(!Bview.importMatrix.is_null()) {
2068 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2069 Irowptr = lclB.graph.row_map;
2070 Icolind = lclB.graph.entries;
2071 Ivals = lclB.values;
2072 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2073 }
2074
2075#ifdef HAVE_TPETRA_MMM_TIMINGS
2076 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2077#endif
2078
2079 // Classic csr assembly (low memory edition)
2080 //
2081 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2082 // The method loops over rows of A, and may resize after processing
2083 // each row. Chris Siefert says that this reflects experience in
2084 // ML; for the non-threaded case, ML found it faster to spend less
2085 // effort on estimation and risk an occasional reallocation.
2086 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2087 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2088 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2089 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2090
2091 // mfh 27 Sep 2016: The c_status array is an implementation detail
2092 // of the local sparse matrix-matrix multiply routine.
2093
2094 // The status array will contain the index into colind where this entry was last deposited.
2095 // c_status[i] < CSR_ip - not in the row yet
2096 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2097 // We start with this filled with INVALID's indicating that there are no entries yet.
2098 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2099 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2100 std::vector<size_t> c_status(n, ST_INVALID);
2101
2102 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2103 // routine. The routine computes C := A * (B_local + B_remote).
2104 //
2105 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2106 // you whether the corresponding row of B belongs to B_local
2107 // ("orig") or B_remote ("Import").
2108
2109 // For each row of A/C
2110 size_t CSR_ip = 0, OLD_ip = 0;
2111 for (size_t i = 0; i < m; i++) {
2112 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2113 // on the calling process.
2114 Crowptr[i] = CSR_ip;
2115
2116 // mfh 27 Sep 2016: For each entry of A in the current row of A
2117 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2118 LO Aik = Acolind[k]; // local column index of current entry of A
2119 const SC Aval = Avals[k]; // value of current entry of A
2120 if (Aval == SC_ZERO)
2121 continue; // skip explicitly stored zero values in A
2122
2123 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2124 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2125 // corresponding to the current entry of A is populated, then
2126 // the corresponding row of B is in B_local (i.e., it lives on
2127 // the calling process).
2128
2129 // Local matrix
2130 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2131
2132 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2133 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2134 LO Bkj = Bcolind[j];
2135 LO Cij = Bcol2Ccol[Bkj];
2136
2137 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2138 // New entry
2139 c_status[Cij] = CSR_ip;
2140 Ccolind[CSR_ip] = Cij;
2141 Cvals[CSR_ip] = Aval*Bvals[j];
2142 CSR_ip++;
2143
2144 } else {
2145 Cvals[c_status[Cij]] += Aval*Bvals[j];
2146 }
2147 }
2148
2149 } else {
2150 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2151 // corresponding to the current entry of A NOT populated (has
2152 // a flag "invalid" value), then the corresponding row of B is
2153 // in B_local (i.e., it lives on the calling process).
2154
2155 // Remote matrix
2156 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2157 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2158 LO Ikj = Icolind[j];
2159 LO Cij = Icol2Ccol[Ikj];
2160
2161 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2162 // New entry
2163 c_status[Cij] = CSR_ip;
2164 Ccolind[CSR_ip] = Cij;
2165 Cvals[CSR_ip] = Aval*Ivals[j];
2166 CSR_ip++;
2167 } else {
2168 Cvals[c_status[Cij]] += Aval*Ivals[j];
2169 }
2170 }
2171 }
2172 }
2173
2174 // Resize for next pass if needed
2175 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2176 CSR_alloc *= 2;
2177 Kokkos::resize(Ccolind,CSR_alloc);
2178 Kokkos::resize(Cvals,CSR_alloc);
2179 }
2180 OLD_ip = CSR_ip;
2181 }
2182
2183 Crowptr[m] = CSR_ip;
2184
2185 // Downward resize
2186 Kokkos::resize(Ccolind,CSR_ip);
2187 Kokkos::resize(Cvals,CSR_ip);
2188
2189#ifdef HAVE_TPETRA_MMM_TIMINGS
2190 {
2191 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2192#endif
2193
2194 // Final sort & set of CRS arrays
2195 if (params.is_null() || params->get("sort entries",true))
2196 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2197 C.setAllValues(Crowptr,Ccolind, Cvals);
2198
2199
2200#ifdef HAVE_TPETRA_MMM_TIMINGS
2201 }
2202 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2203 {
2204#endif
2205
2206 // Final FillComplete
2207 //
2208 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2209 // Import (from domain Map to column Map) construction (which costs
2210 // lots of communication) by taking the previously constructed
2211 // Import object. We should be able to do this without interfering
2212 // with the implementation of the local part of sparse matrix-matrix
2213 // multply above.
2214 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2215 labelList->set("Timer Label",label);
2216 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2217 RCP<const Export<LO,GO,NO> > dummyExport;
2218 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2219#ifdef HAVE_TPETRA_MMM_TIMINGS
2220 }
2221 MM2 = Teuchos::null;
2222#endif
2223
2224}
2225/*********************************************************************************************************/
2226// Kernel method for computing the local portion of C = A*B (reuse)
2227template<class Scalar,
2228 class LocalOrdinal,
2229 class GlobalOrdinal,
2230 class Node>
2231void mult_A_B_reuse(
2232 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2233 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2234 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2235 const std::string& label,
2236 const Teuchos::RCP<Teuchos::ParameterList>& params)
2237{
2238 using Teuchos::Array;
2239 using Teuchos::ArrayRCP;
2240 using Teuchos::ArrayView;
2241 using Teuchos::RCP;
2242 using Teuchos::rcp;
2243
2244 // Tpetra typedefs
2245 typedef LocalOrdinal LO;
2246 typedef GlobalOrdinal GO;
2247 typedef Node NO;
2248 typedef Import<LO,GO,NO> import_type;
2249 typedef Map<LO,GO,NO> map_type;
2250
2251 // Kokkos typedefs
2252 typedef typename map_type::local_map_type local_map_type;
2254 typedef typename KCRS::StaticCrsGraphType graph_t;
2255 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2256 typedef typename NO::execution_space execution_space;
2257 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2258 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2259
2260#ifdef HAVE_TPETRA_MMM_TIMINGS
2261 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2262 using Teuchos::TimeMonitor;
2263 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2264#endif
2265 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2266
2267 // Grab all the maps
2268 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2269 RCP<const map_type> Ccolmap = C.getColMap();
2270 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2271 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2272 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2273 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2274 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2275 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276
2277 // Build the final importer / column map, hash table lookups for C
2278 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2279 {
2280 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2281 // So, column map of C may be a strict subset of the column map of B
2282 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2283 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2284 });
2285
2286 if (!Bview.importMatrix.is_null()) {
2287 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2288 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2289
2290 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2291 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2292 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2293 });
2294 }
2295 }
2296
2297 // Run through all the hash table lookups once and for all
2298 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2299 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2300 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2301 GO aidx = Acolmap_local.getGlobalElement(i);
2302 LO B_LID = Browmap_local.getLocalElement(aidx);
2303 if (B_LID != LO_INVALID) {
2304 targetMapToOrigRow(i) = B_LID;
2305 targetMapToImportRow(i) = LO_INVALID;
2306 } else {
2307 LO I_LID = Irowmap_local.getLocalElement(aidx);
2308 targetMapToOrigRow(i) = LO_INVALID;
2309 targetMapToImportRow(i) = I_LID;
2310
2311 }
2312 });
2313
2314 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2315 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2316 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2317}
2318
2319/*********************************************************************************************************/
2320template<class Scalar,
2321 class LocalOrdinal,
2322 class GlobalOrdinal,
2323 class Node,
2324 class LocalOrdinalViewType>
2325void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2326 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2327 const LocalOrdinalViewType & targetMapToOrigRow,
2328 const LocalOrdinalViewType & targetMapToImportRow,
2329 const LocalOrdinalViewType & Bcol2Ccol,
2330 const LocalOrdinalViewType & Icol2Ccol,
2331 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2332 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2333 const std::string& label,
2334 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2335#ifdef HAVE_TPETRA_MMM_TIMINGS
2336 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2337 using Teuchos::TimeMonitor;
2338 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2339 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2340#else
2341 (void)label;
2342#endif
2343 using Teuchos::RCP;
2344 using Teuchos::rcp;
2345
2346
2347 // Lots and lots of typedefs
2348 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2349 typedef typename KCRS::StaticCrsGraphType graph_t;
2350 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2351 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2352 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2353
2354 typedef Scalar SC;
2355 typedef LocalOrdinal LO;
2356 typedef GlobalOrdinal GO;
2357 typedef Node NO;
2358 typedef Map<LO,GO,NO> map_type;
2359 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2360 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2361 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2362
2363 // Sizes
2364 RCP<const map_type> Ccolmap = C.getColMap();
2365 size_t m = Aview.origMatrix->getLocalNumRows();
2366 size_t n = Ccolmap->getLocalNumElements();
2367
2368 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2369 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2370 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2371 const KCRS & Cmat = C.getLocalMatrixHost();
2372
2373 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2374 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2375 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2376 scalar_view_t Cvals = Cmat.values;
2377
2378 c_lno_view_t Irowptr;
2379 lno_nnz_view_t Icolind;
2380 scalar_view_t Ivals;
2381 if(!Bview.importMatrix.is_null()) {
2382 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2383 Irowptr = lclB.graph.row_map;
2384 Icolind = lclB.graph.entries;
2385 Ivals = lclB.values;
2386 }
2387
2388#ifdef HAVE_TPETRA_MMM_TIMINGS
2389 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2390#endif
2391
2392 // Classic csr assembly (low memory edition)
2393 // mfh 27 Sep 2016: The c_status array is an implementation detail
2394 // of the local sparse matrix-matrix multiply routine.
2395
2396 // The status array will contain the index into colind where this entry was last deposited.
2397 // c_status[i] < CSR_ip - not in the row yet
2398 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2399 // We start with this filled with INVALID's indicating that there are no entries yet.
2400 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2401 std::vector<size_t> c_status(n, ST_INVALID);
2402
2403 // For each row of A/C
2404 size_t CSR_ip = 0, OLD_ip = 0;
2405 for (size_t i = 0; i < m; i++) {
2406 // First fill the c_status array w/ locations where we're allowed to
2407 // generate nonzeros for this row
2408 OLD_ip = Crowptr[i];
2409 CSR_ip = Crowptr[i+1];
2410 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2411 c_status[Ccolind[k]] = k;
2412
2413 // Reset values in the row of C
2414 Cvals[k] = SC_ZERO;
2415 }
2416
2417 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2418 LO Aik = Acolind[k];
2419 const SC Aval = Avals[k];
2420 if (Aval == SC_ZERO)
2421 continue;
2422
2423 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2424 // Local matrix
2425 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2426
2427 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2428 LO Bkj = Bcolind[j];
2429 LO Cij = Bcol2Ccol[Bkj];
2430
2431 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2432 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2433 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2434
2435 Cvals[c_status[Cij]] += Aval * Bvals[j];
2436 }
2437
2438 } else {
2439 // Remote matrix
2440 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2441 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2442 LO Ikj = Icolind[j];
2443 LO Cij = Icol2Ccol[Ikj];
2444
2445 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2446 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2447 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2448
2449 Cvals[c_status[Cij]] += Aval * Ivals[j];
2450 }
2451 }
2452 }
2453 }
2454
2455#ifdef HAVE_TPETRA_MMM_TIMINGS
2456 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2457#endif
2458
2459 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2460}
2461
2462
2463/*********************************************************************************************************/
2464// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2465template<class Scalar,
2466 class LocalOrdinal,
2467 class GlobalOrdinal,
2468 class Node>
2469void jacobi_A_B_newmatrix(
2470 Scalar omega,
2471 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2472 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2473 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2474 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2475 const std::string& label,
2476 const Teuchos::RCP<Teuchos::ParameterList>& params)
2477{
2478 using Teuchos::Array;
2479 using Teuchos::ArrayRCP;
2480 using Teuchos::ArrayView;
2481 using Teuchos::RCP;
2482 using Teuchos::rcp;
2483 // typedef Scalar SC;
2484 typedef LocalOrdinal LO;
2485 typedef GlobalOrdinal GO;
2486 typedef Node NO;
2487
2488 typedef Import<LO,GO,NO> import_type;
2489 typedef Map<LO,GO,NO> map_type;
2490 typedef typename map_type::local_map_type local_map_type;
2491
2492 // All of the Kokkos typedefs
2494 typedef typename KCRS::StaticCrsGraphType graph_t;
2495 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2496 typedef typename NO::execution_space execution_space;
2497 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2498 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2499
2500
2501#ifdef HAVE_TPETRA_MMM_TIMINGS
2502 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2503 using Teuchos::TimeMonitor;
2504 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2505#endif
2506 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2507
2508 // Build the final importer / column map, hash table lookups for C
2509 RCP<const import_type> Cimport;
2510 RCP<const map_type> Ccolmap;
2511 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2512 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2513 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2514 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2515 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2516 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2517 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2518 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2519
2520 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2521 // indices of B, to local column indices of C. (B and C have the
2522 // same number of columns.) The kernel uses this, instead of
2523 // copying the entire input matrix B and converting its column
2524 // indices to those of C.
2525 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2526
2527 if (Bview.importMatrix.is_null()) {
2528 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2529 Cimport = Bimport;
2530 Ccolmap = Bview.colMap;
2531 // Bcol2Ccol is trivial
2532 // Bcol2Ccol is trivial
2533
2534 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2535 Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2536 Bcol2Ccol(i) = static_cast<LO> (i);
2537 });
2538 } else {
2539 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2540 // column Map of C, as well as C's Import object (from its domain
2541 // Map to its column Map). C's column Map is the union of the
2542 // column Maps of (the local part of) B, and the "remote" part of
2543 // B. Ditto for the Import. We have optimized this "setUnion"
2544 // operation on Import objects and Maps.
2545
2546 // Choose the right variant of setUnion
2547 if (!Bimport.is_null() && !Iimport.is_null()){
2548 Cimport = Bimport->setUnion(*Iimport);
2549 Ccolmap = Cimport->getTargetMap();
2550
2551 } else if (!Bimport.is_null() && Iimport.is_null()) {
2552 Cimport = Bimport->setUnion();
2553
2554 } else if(Bimport.is_null() && !Iimport.is_null()) {
2555 Cimport = Iimport->setUnion();
2556
2557 } else
2558 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2559
2560 Ccolmap = Cimport->getTargetMap();
2561
2562 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2563 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2564
2565 // NOTE: This is not efficient and should be folded into setUnion
2566 //
2567 // mfh 27 Sep 2016: What the above comment means, is that the
2568 // setUnion operation on Import objects could also compute these
2569 // local index - to - local index look-up tables.
2570 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2571 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2572 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2573 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2574 });
2575 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2576 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2577 });
2578
2579 }
2580
2581 // Replace the column map
2582 //
2583 // mfh 27 Sep 2016: We do this because C was originally created
2584 // without a column Map. Now we have its column Map.
2585 C.replaceColMap(Ccolmap);
2586
2587 // mfh 27 Sep 2016: Construct tables that map from local column
2588 // indices of A, to local row indices of either B_local (the locally
2589 // owned part of B), or B_remote (the "imported" remote part of B).
2590 //
2591 // For column index Aik in row i of A, if the corresponding row of B
2592 // exists in the local part of B ("orig") (which I'll call B_local),
2593 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2594 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2595 //
2596 // For column index Aik in row i of A, if the corresponding row of B
2597 // exists in the remote part of B ("Import") (which I'll call
2598 // B_remote), then targetMapToImportRow[Aik] is the local index of
2599 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2600 // (a flag value).
2601
2602 // Run through all the hash table lookups once and for all
2603 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2604 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2605 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2606 GO aidx = Acolmap_local.getGlobalElement(i);
2607 LO B_LID = Browmap_local.getLocalElement(aidx);
2608 if (B_LID != LO_INVALID) {
2609 targetMapToOrigRow(i) = B_LID;
2610 targetMapToImportRow(i) = LO_INVALID;
2611 } else {
2612 LO I_LID = Irowmap_local.getLocalElement(aidx);
2613 targetMapToOrigRow(i) = LO_INVALID;
2614 targetMapToImportRow(i) = I_LID;
2615
2616 }
2617 });
2618
2619 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2620 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2621 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2622
2623}
2624
2625
2626/*********************************************************************************************************/
2627// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2628// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2629
2630template<class Scalar,
2631 class LocalOrdinal,
2632 class GlobalOrdinal,
2633 class Node,
2634 class LocalOrdinalViewType>
2635void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2636 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2637 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2638 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2639 const LocalOrdinalViewType & targetMapToOrigRow,
2640 const LocalOrdinalViewType & targetMapToImportRow,
2641 const LocalOrdinalViewType & Bcol2Ccol,
2642 const LocalOrdinalViewType & Icol2Ccol,
2643 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2644 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2645 const std::string& label,
2646 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2647
2648#ifdef HAVE_TPETRA_MMM_TIMINGS
2649 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2650 using Teuchos::TimeMonitor;
2651 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2652#endif
2653
2654 using Teuchos::Array;
2655 using Teuchos::ArrayRCP;
2656 using Teuchos::ArrayView;
2657 using Teuchos::RCP;
2658 using Teuchos::rcp;
2659
2660 // Lots and lots of typedefs
2661 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2662 typedef typename KCRS::StaticCrsGraphType graph_t;
2663 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2664 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2665 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2666 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2667
2668 // Jacobi-specific
2669 typedef typename scalar_view_t::memory_space scalar_memory_space;
2670
2671 typedef Scalar SC;
2672 typedef LocalOrdinal LO;
2673 typedef GlobalOrdinal GO;
2674 typedef Node NO;
2675
2676 typedef Map<LO,GO,NO> map_type;
2677 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2678 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2679
2680 // Sizes
2681 RCP<const map_type> Ccolmap = C.getColMap();
2682 size_t m = Aview.origMatrix->getLocalNumRows();
2683 size_t n = Ccolmap->getLocalNumElements();
2684 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2685
2686 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2687 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2688 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2689
2690 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2691 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2692 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2693
2694 c_lno_view_t Irowptr;
2695 lno_nnz_view_t Icolind;
2696 scalar_view_t Ivals;
2697 if(!Bview.importMatrix.is_null()) {
2698 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2699 Irowptr = lclB.graph.row_map;
2700 Icolind = lclB.graph.entries;
2701 Ivals = lclB.values;
2702 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2703 }
2704
2705 // Jacobi-specific inner stuff
2706 auto Dvals =
2707 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2708
2709 // Teuchos::ArrayView::operator[].
2710 // The status array will contain the index into colind where this entry was last deposited.
2711 // c_status[i] < CSR_ip - not in the row yet.
2712 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2713 // We start with this filled with INVALID's indicating that there are no entries yet.
2714 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2715 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2716 Array<size_t> c_status(n, ST_INVALID);
2717
2718 // Classic csr assembly (low memory edition)
2719 //
2720 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2721 // The method loops over rows of A, and may resize after processing
2722 // each row. Chris Siefert says that this reflects experience in
2723 // ML; for the non-threaded case, ML found it faster to spend less
2724 // effort on estimation and risk an occasional reallocation.
2725 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2726 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2727 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2728 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2729 size_t CSR_ip = 0, OLD_ip = 0;
2730
2731 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2732
2733 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2734 // routine. The routine computes
2735 //
2736 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2737 //
2738 // This corresponds to one sweep of (weighted) Jacobi.
2739 //
2740 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2741 // you whether the corresponding row of B belongs to B_local
2742 // ("orig") or B_remote ("Import").
2743
2744 // For each row of A/C
2745 for (size_t i = 0; i < m; i++) {
2746 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2747 // on the calling process.
2748 Crowptr[i] = CSR_ip;
2749 SC minusOmegaDval = -omega*Dvals(i,0);
2750
2751 // Entries of B
2752 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2753 Scalar Bval = Bvals[j];
2754 if (Bval == SC_ZERO)
2755 continue;
2756 LO Bij = Bcolind[j];
2757 LO Cij = Bcol2Ccol[Bij];
2758
2759 // Assume no repeated entries in B
2760 c_status[Cij] = CSR_ip;
2761 Ccolind[CSR_ip] = Cij;
2762 Cvals[CSR_ip] = Bvals[j];
2763 CSR_ip++;
2764 }
2765
2766 // Entries of -omega * Dinv * A * B
2767 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2768 LO Aik = Acolind[k];
2769 const SC Aval = Avals[k];
2770 if (Aval == SC_ZERO)
2771 continue;
2772
2773 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2774 // Local matrix
2775 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2776
2777 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2778 LO Bkj = Bcolind[j];
2779 LO Cij = Bcol2Ccol[Bkj];
2780
2781 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2782 // New entry
2783 c_status[Cij] = CSR_ip;
2784 Ccolind[CSR_ip] = Cij;
2785 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2786 CSR_ip++;
2787
2788 } else {
2789 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2790 }
2791 }
2792
2793 } else {
2794 // Remote matrix
2795 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2796 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2797 LO Ikj = Icolind[j];
2798 LO Cij = Icol2Ccol[Ikj];
2799
2800 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2801 // New entry
2802 c_status[Cij] = CSR_ip;
2803 Ccolind[CSR_ip] = Cij;
2804 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2805 CSR_ip++;
2806 } else {
2807 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2808 }
2809 }
2810 }
2811 }
2812
2813 // Resize for next pass if needed
2814 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2815 CSR_alloc *= 2;
2816 Kokkos::resize(Ccolind,CSR_alloc);
2817 Kokkos::resize(Cvals,CSR_alloc);
2818 }
2819 OLD_ip = CSR_ip;
2820 }
2821 Crowptr[m] = CSR_ip;
2822
2823 // Downward resize
2824 Kokkos::resize(Ccolind,CSR_ip);
2825 Kokkos::resize(Cvals,CSR_ip);
2826
2827 {
2828#ifdef HAVE_TPETRA_MMM_TIMINGS
2829 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2830#endif
2831
2832 // Replace the column map
2833 //
2834 // mfh 27 Sep 2016: We do this because C was originally created
2835 // without a column Map. Now we have its column Map.
2836 C.replaceColMap(Ccolmap);
2837
2838 // Final sort & set of CRS arrays
2839 //
2840 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2841 // matrix-matrix multiply routine sort the entries for us?
2842 // Final sort & set of CRS arrays
2843 if (params.is_null() || params->get("sort entries",true))
2844 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2845 C.setAllValues(Crowptr,Ccolind, Cvals);
2846 }
2847 {
2848#ifdef HAVE_TPETRA_MMM_TIMINGS
2849 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2850#endif
2851
2852 // Final FillComplete
2853 //
2854 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2855 // Import (from domain Map to column Map) construction (which costs
2856 // lots of communication) by taking the previously constructed
2857 // Import object. We should be able to do this without interfering
2858 // with the implementation of the local part of sparse matrix-matrix
2859 // multply above
2860 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2861 labelList->set("Timer Label",label);
2862 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2863 RCP<const Export<LO,GO,NO> > dummyExport;
2864 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2865
2866 }
2867}
2868
2869
2870/*********************************************************************************************************/
2871// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2872template<class Scalar,
2873 class LocalOrdinal,
2874 class GlobalOrdinal,
2875 class Node>
2876void jacobi_A_B_reuse(
2877 Scalar omega,
2878 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2879 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2880 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2881 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2882 const std::string& label,
2883 const Teuchos::RCP<Teuchos::ParameterList>& params)
2884{
2885 using Teuchos::Array;
2886 using Teuchos::ArrayRCP;
2887 using Teuchos::ArrayView;
2888 using Teuchos::RCP;
2889 using Teuchos::rcp;
2890
2891 typedef LocalOrdinal LO;
2892 typedef GlobalOrdinal GO;
2893 typedef Node NO;
2894
2895 typedef Import<LO,GO,NO> import_type;
2896 typedef Map<LO,GO,NO> map_type;
2897
2898 // Kokkos typedefs
2899 typedef typename map_type::local_map_type local_map_type;
2901 typedef typename KCRS::StaticCrsGraphType graph_t;
2902 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2903 typedef typename NO::execution_space execution_space;
2904 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2905 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2906
2907#ifdef HAVE_TPETRA_MMM_TIMINGS
2908 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2909 using Teuchos::TimeMonitor;
2910 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2911#endif
2912 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2913
2914 // Grab all the maps
2915 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2916 RCP<const map_type> Ccolmap = C.getColMap();
2917 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2918 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2919 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2920 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2921 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2922 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2923
2924 // Build the final importer / column map, hash table lookups for C
2925 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2926 {
2927 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2928 // So, column map of C may be a strict subset of the column map of B
2929 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2930 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2931 });
2932
2933 if (!Bview.importMatrix.is_null()) {
2934 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2935 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2936
2937 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2938 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2939 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2940 });
2941 }
2942 }
2943
2944 // Run through all the hash table lookups once and for all
2945 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2946 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2947 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2948 GO aidx = Acolmap_local.getGlobalElement(i);
2949 LO B_LID = Browmap_local.getLocalElement(aidx);
2950 if (B_LID != LO_INVALID) {
2951 targetMapToOrigRow(i) = B_LID;
2952 targetMapToImportRow(i) = LO_INVALID;
2953 } else {
2954 LO I_LID = Irowmap_local.getLocalElement(aidx);
2955 targetMapToOrigRow(i) = LO_INVALID;
2956 targetMapToImportRow(i) = I_LID;
2957
2958 }
2959 });
2960
2961#ifdef HAVE_TPETRA_MMM_TIMINGS
2962 MM = Teuchos::null;
2963#endif
2964
2965 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2966 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2967 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2968}
2969
2970
2971
2972/*********************************************************************************************************/
2973template<class Scalar,
2974 class LocalOrdinal,
2975 class GlobalOrdinal,
2976 class Node,
2977 class LocalOrdinalViewType>
2978void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2979 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2980 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2981 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2982 const LocalOrdinalViewType & targetMapToOrigRow,
2983 const LocalOrdinalViewType & targetMapToImportRow,
2984 const LocalOrdinalViewType & Bcol2Ccol,
2985 const LocalOrdinalViewType & Icol2Ccol,
2986 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2987 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2988 const std::string& label,
2989 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2990#ifdef HAVE_TPETRA_MMM_TIMINGS
2991 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2992 using Teuchos::TimeMonitor;
2993 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2994 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2995#else
2996 (void)label;
2997#endif
2998 using Teuchos::RCP;
2999 using Teuchos::rcp;
3000
3001 // Lots and lots of typedefs
3002 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3003 typedef typename KCRS::StaticCrsGraphType graph_t;
3004 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3005 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3006 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3007 typedef typename scalar_view_t::memory_space scalar_memory_space;
3008
3009 typedef Scalar SC;
3010 typedef LocalOrdinal LO;
3011 typedef GlobalOrdinal GO;
3012 typedef Node NO;
3013 typedef Map<LO,GO,NO> map_type;
3014 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3015 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3016 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3017
3018 // Sizes
3019 RCP<const map_type> Ccolmap = C.getColMap();
3020 size_t m = Aview.origMatrix->getLocalNumRows();
3021 size_t n = Ccolmap->getLocalNumElements();
3022
3023 // Grab the Kokkos::SparseCrsMatrices & inner stuff
3024 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3025 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3026 const KCRS & Cmat = C.getLocalMatrixHost();
3027
3028 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3029 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3030 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3031 scalar_view_t Cvals = Cmat.values;
3032
3033 c_lno_view_t Irowptr;
3034 lno_nnz_view_t Icolind;
3035 scalar_view_t Ivals;
3036 if(!Bview.importMatrix.is_null()) {
3037 auto lclB = Bview.importMatrix->getLocalMatrixHost();
3038 Irowptr = lclB.graph.row_map;
3039 Icolind = lclB.graph.entries;
3040 Ivals = lclB.values;
3041 }
3042
3043 // Jacobi-specific inner stuff
3044 auto Dvals =
3045 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3046
3047#ifdef HAVE_TPETRA_MMM_TIMINGS
3048 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3049#endif
3050
3051 // The status array will contain the index into colind where this entry was last deposited.
3052 // c_status[i] < CSR_ip - not in the row yet
3053 // c_status[i] >= CSR_ip - this is the entry where you can find the data
3054 // We start with this filled with INVALID's indicating that there are no entries yet.
3055 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3056 std::vector<size_t> c_status(n, ST_INVALID);
3057
3058 // For each row of A/C
3059 size_t CSR_ip = 0, OLD_ip = 0;
3060 for (size_t i = 0; i < m; i++) {
3061
3062 // First fill the c_status array w/ locations where we're allowed to
3063 // generate nonzeros for this row
3064 OLD_ip = Crowptr[i];
3065 CSR_ip = Crowptr[i+1];
3066 for (size_t k = OLD_ip; k < CSR_ip; k++) {
3067 c_status[Ccolind[k]] = k;
3068
3069 // Reset values in the row of C
3070 Cvals[k] = SC_ZERO;
3071 }
3072
3073 SC minusOmegaDval = -omega*Dvals(i,0);
3074
3075 // Entries of B
3076 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3077 Scalar Bval = Bvals[j];
3078 if (Bval == SC_ZERO)
3079 continue;
3080 LO Bij = Bcolind[j];
3081 LO Cij = Bcol2Ccol[Bij];
3082
3083 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3084 std::runtime_error, "Trying to insert a new entry into a static graph");
3085
3086 Cvals[c_status[Cij]] = Bvals[j];
3087 }
3088
3089 // Entries of -omega * Dinv * A * B
3090 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3091 LO Aik = Acolind[k];
3092 const SC Aval = Avals[k];
3093 if (Aval == SC_ZERO)
3094 continue;
3095
3096 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3097 // Local matrix
3098 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3099
3100 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3101 LO Bkj = Bcolind[j];
3102 LO Cij = Bcol2Ccol[Bkj];
3103
3104 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3105 std::runtime_error, "Trying to insert a new entry into a static graph");
3106
3107 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3108 }
3109
3110 } else {
3111 // Remote matrix
3112 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3113 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3114 LO Ikj = Icolind[j];
3115 LO Cij = Icol2Ccol[Ikj];
3116
3117 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3118 std::runtime_error, "Trying to insert a new entry into a static graph");
3119
3120 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3121 }
3122 }
3123 }
3124 }
3125
3126#ifdef HAVE_TPETRA_MMM_TIMINGS
3127 MM2 = Teuchos::null;
3128 MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3129#endif
3130
3131 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3132#ifdef HAVE_TPETRA_MMM_TIMINGS
3133 MM2 = Teuchos::null;
3134 MM = Teuchos::null;
3135#endif
3136
3137}
3138
3139
3140
3141/*********************************************************************************************************/
3142template<class Scalar,
3143 class LocalOrdinal,
3144 class GlobalOrdinal,
3145 class Node>
3146void import_and_extract_views(
3147 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3148 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3149 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3150 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3151 bool userAssertsThereAreNoRemotes,
3152 const std::string& label,
3153 const Teuchos::RCP<Teuchos::ParameterList>& params)
3154{
3155 using Teuchos::Array;
3156 using Teuchos::ArrayView;
3157 using Teuchos::RCP;
3158 using Teuchos::rcp;
3159 using Teuchos::null;
3160
3161 typedef Scalar SC;
3162 typedef LocalOrdinal LO;
3163 typedef GlobalOrdinal GO;
3164 typedef Node NO;
3165
3166 typedef Map<LO,GO,NO> map_type;
3167 typedef Import<LO,GO,NO> import_type;
3168 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3169
3170#ifdef HAVE_TPETRA_MMM_TIMINGS
3171 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3172 using Teuchos::TimeMonitor;
3173 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3174#endif
3175 // The goal of this method is to populate the 'Aview' struct with views of the
3176 // rows of A, including all rows that correspond to elements in 'targetMap'.
3177 //
3178 // If targetMap includes local elements that correspond to remotely-owned rows
3179 // of A, then those remotely-owned rows will be imported into
3180 // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3181 Aview.deleteContents();
3182
3183 Aview.origMatrix = rcp(&A, false);
3184 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3185 Aview.origMatrix->getApplyHelper();
3186 Aview.origRowMap = A.getRowMap();
3187 Aview.rowMap = targetMap;
3188 Aview.colMap = A.getColMap();
3189 Aview.domainMap = A.getDomainMap();
3190 Aview.importColMap = null;
3191 RCP<const map_type> rowMap = A.getRowMap();
3192 const int numProcs = rowMap->getComm()->getSize();
3193
3194 // Short circuit if the user swears there are no remotes (or if we're in serial)
3195 if (userAssertsThereAreNoRemotes || numProcs < 2)
3196 return;
3197
3198 RCP<const import_type> importer;
3199 if (params != null && params->isParameter("importer")) {
3200 importer = params->get<RCP<const import_type> >("importer");
3201
3202 } else {
3203#ifdef HAVE_TPETRA_MMM_TIMINGS
3204 MM = Teuchos::null;
3205 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3206#endif
3207
3208 // Mark each row in targetMap as local or remote, and go ahead and get a view
3209 // for the local rows
3210 RCP<const map_type> remoteRowMap;
3211 size_t numRemote = 0;
3212 int mode = 0;
3213 if (!prototypeImporter.is_null() &&
3214 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3215 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3216 // We have a valid prototype importer --- ask it for the remotes
3217#ifdef HAVE_TPETRA_MMM_TIMINGS
3218 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
3219#endif
3220 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3221 numRemote = prototypeImporter->getNumRemoteIDs();
3222
3223 Array<GO> remoteRows(numRemote);
3224 for (size_t i = 0; i < numRemote; i++)
3225 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3226
3227 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3228 rowMap->getIndexBase(), rowMap->getComm()));
3229 mode = 1;
3230
3231 } else if (prototypeImporter.is_null()) {
3232 // No prototype importer --- count the remotes the hard way
3233#ifdef HAVE_TPETRA_MMM_TIMINGS
3234 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
3235#endif
3236 ArrayView<const GO> rows = targetMap->getLocalElementList();
3237 size_t numRows = targetMap->getLocalNumElements();
3238
3239 Array<GO> remoteRows(numRows);
3240 for(size_t i = 0; i < numRows; ++i) {
3241 const LO mlid = rowMap->getLocalElement(rows[i]);
3242
3243 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3244 remoteRows[numRemote++] = rows[i];
3245 }
3246 remoteRows.resize(numRemote);
3247 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3248 rowMap->getIndexBase(), rowMap->getComm()));
3249 mode = 2;
3250
3251 } else {
3252 // PrototypeImporter is bad. But if we're in serial that's OK.
3253 mode = 3;
3254 }
3255
3256 if (numProcs < 2) {
3257 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3258 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3259 // If only one processor we don't need to import any remote rows, so return.
3260 return;
3261 }
3262
3263 //
3264 // Now we will import the needed remote rows of A, if the global maximum
3265 // value of numRemote is greater than 0.
3266 //
3267#ifdef HAVE_TPETRA_MMM_TIMINGS
3268 MM = Teuchos::null;
3269 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3270#endif
3271
3272 global_size_t globalMaxNumRemote = 0;
3273 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3274
3275 if (globalMaxNumRemote > 0) {
3276#ifdef HAVE_TPETRA_MMM_TIMINGS
3277 MM = Teuchos::null;
3278 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3279#endif
3280 // Create an importer with target-map remoteRowMap and source-map rowMap.
3281 if (mode == 1)
3282 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3283 else if (mode == 2)
3284 importer = rcp(new import_type(rowMap, remoteRowMap));
3285 else
3286 throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3287 }
3288
3289 if (params != null)
3290 params->set("importer", importer);
3291 }
3292
3293 if (importer != null) {
3294#ifdef HAVE_TPETRA_MMM_TIMINGS
3295 MM = Teuchos::null;
3296 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3297#endif
3298
3299 // Now create a new matrix into which we can import the remote rows of A that we need.
3300 Teuchos::ParameterList labelList;
3301 labelList.set("Timer Label", label);
3302 auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3303
3304 bool isMM = true;
3305 bool overrideAllreduce = false;
3306 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3307 // Minor speedup tweak - avoid computing the global constants
3308 Teuchos::ParameterList params_sublist;
3309 if(!params.is_null()) {
3310 labelList.set("compute global constants", params->get("compute global constants",false));
3311 params_sublist = params->sublist("matrixmatrix: kernel params",false);
3312 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3313 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3314 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3315 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3316 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3317 }
3318 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3319 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3320 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3321
3322 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3323 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3324 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3325 Aview.importMatrix->getApplyHelper();
3326
3327#if 0
3328 // Disabled code for dumping input matrices
3329 static int count=0;
3330 char str[80];
3331 sprintf(str,"import_matrix.%d.dat",count);
3333 count++;
3334#endif
3335
3336#ifdef HAVE_TPETRA_MMM_STATISTICS
3337 printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3338#endif
3339
3340
3341#ifdef HAVE_TPETRA_MMM_TIMINGS
3342 MM = Teuchos::null;
3343 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3344#endif
3345
3346 // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3347 Aview.importColMap = Aview.importMatrix->getColMap();
3348#ifdef HAVE_TPETRA_MMM_TIMINGS
3349 MM = Teuchos::null;
3350#endif
3351 }
3352}
3353
3354/*********************************************************************************************************/
3355template<class Scalar,
3356 class LocalOrdinal,
3357 class GlobalOrdinal,
3358 class Node>
3359void import_and_extract_views(
3360 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3361 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3362 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3363 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3364 bool userAssertsThereAreNoRemotes)
3365{
3366 using Teuchos::Array;
3367 using Teuchos::ArrayView;
3368 using Teuchos::RCP;
3369 using Teuchos::rcp;
3370 using Teuchos::null;
3371
3372 typedef Scalar SC;
3373 typedef LocalOrdinal LO;
3374 typedef GlobalOrdinal GO;
3375 typedef Node NO;
3376
3377 typedef Map<LO,GO,NO> map_type;
3378 typedef Import<LO,GO,NO> import_type;
3379 typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3380
3381 // The goal of this method is to populate the 'Mview' struct with views of the
3382 // rows of M, including all rows that correspond to elements in 'targetMap'.
3383 //
3384 // If targetMap includes local elements that correspond to remotely-owned rows
3385 // of M, then those remotely-owned rows will be imported into
3386 // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3387 Mview.deleteContents();
3388
3389 Mview.origMatrix = rcp(&M, false);
3390 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3391 Mview.origMatrix->getApplyHelper();
3392 Mview.origRowMap = M.getRowMap();
3393 Mview.rowMap = targetMap;
3394 Mview.colMap = M.getColMap();
3395 Mview.importColMap = null;
3396 RCP<const map_type> rowMap = M.getRowMap();
3397 const int numProcs = rowMap->getComm()->getSize();
3398
3399 // Short circuit if the user swears there are no remotes (or if we're in serial)
3400 if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3401
3402 // Mark each row in targetMap as local or remote, and go ahead and get a view
3403 // for the local rows
3404 RCP<const map_type> remoteRowMap;
3405 size_t numRemote = 0;
3406 int mode = 0;
3407 if (!prototypeImporter.is_null() &&
3408 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3409 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3410
3411 // We have a valid prototype importer --- ask it for the remotes
3412 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3413 numRemote = prototypeImporter->getNumRemoteIDs();
3414
3415 Array<GO> remoteRows(numRemote);
3416 for (size_t i = 0; i < numRemote; i++)
3417 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3418
3419 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3420 rowMap->getIndexBase(), rowMap->getComm()));
3421 mode = 1;
3422
3423 } else if (prototypeImporter.is_null()) {
3424
3425 // No prototype importer --- count the remotes the hard way
3426 ArrayView<const GO> rows = targetMap->getLocalElementList();
3427 size_t numRows = targetMap->getLocalNumElements();
3428
3429 Array<GO> remoteRows(numRows);
3430 for(size_t i = 0; i < numRows; ++i) {
3431 const LO mlid = rowMap->getLocalElement(rows[i]);
3432
3433 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3434 remoteRows[numRemote++] = rows[i];
3435 }
3436 remoteRows.resize(numRemote);
3437 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3438 rowMap->getIndexBase(), rowMap->getComm()));
3439 mode = 2;
3440
3441 } else {
3442 // PrototypeImporter is bad. But if we're in serial that's OK.
3443 mode = 3;
3444 }
3445
3446 if (numProcs < 2) {
3447 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3448 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3449 // If only one processor we don't need to import any remote rows, so return.
3450 return;
3451 }
3452
3453 // Now we will import the needed remote rows of M, if the global maximum
3454 // value of numRemote is greater than 0.
3455 global_size_t globalMaxNumRemote = 0;
3456 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3457
3458 RCP<const import_type> importer;
3459
3460 if (globalMaxNumRemote > 0) {
3461 // Create an importer with target-map remoteRowMap and source-map rowMap.
3462 if (mode == 1)
3463 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3464 else if (mode == 2)
3465 importer = rcp(new import_type(rowMap, remoteRowMap));
3466 else
3467 throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3468 }
3469
3470 if (importer != null) {
3471 // Get import matrix
3472 // TODO: create the int-typed row-pointer here
3473 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3474 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3475 Mview.importMatrix->getApplyHelper();
3476 // Save the column map of the imported matrix, so that we can convert indices
3477 // back to global for arithmetic later
3478 Mview.importColMap = Mview.importMatrix->getColMap();
3479 }
3480}
3481
3482/*********************************************************************************************************/
3483 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3484template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3486merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3487 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3488 const LocalOrdinalViewType & Acol2Brow,
3489 const LocalOrdinalViewType & Acol2Irow,
3490 const LocalOrdinalViewType & Bcol2Ccol,
3491 const LocalOrdinalViewType & Icol2Ccol,
3492 const size_t mergedNodeNumCols) {
3493
3494 using Teuchos::RCP;
3496 typedef typename KCRS::StaticCrsGraphType graph_t;
3497 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3498 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3499 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3500 // Grab the Kokkos::SparseCrsMatrices
3501 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3502 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3503
3504 // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3505 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3506 // We do have a Bimport
3507 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3508 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3509 RCP<const KCRS> Ik_;
3510 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3511 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3512 KCRS Iks;
3513 if(Ik!=0) Iks = *Ik;
3514 size_t merge_numrows = Ak.numCols();
3515 // The last entry of this at least, need to be initialized
3516 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3517
3518 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3519
3520 // Use a Kokkos::parallel_scan to build the rowptr
3521 typedef typename Node::execution_space execution_space;
3522 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3523 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3524 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3525 if(final) Mrowptr(i) = update;
3526 // Get the row count
3527 size_t ct=0;
3528 if(Acol2Brow(i)!=LO_INVALID)
3529 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3530 else
3531 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3532 update+=ct;
3533
3534 if(final && i+1==merge_numrows)
3535 Mrowptr(i+1)=update;
3536 });
3537
3538 // Allocate nnz
3539 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3540 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3541 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3542
3543 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3544 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3546 if(Acol2Brow(i)!=LO_INVALID) {
3547 size_t row = Acol2Brow(i);
3548 size_t start = Bk.graph.row_map(row);
3549 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3550 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3551 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3552 }
3553 }
3554 else {
3555 size_t row = Acol2Irow(i);
3556 size_t start = Iks.graph.row_map(row);
3557 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3558 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3559 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3560 }
3561 }
3562 });
3563
3564 KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3565 return newmat;
3566 }
3567 else {
3568 // We don't have a Bimport (the easy case)
3569 return Bk;
3570 }
3571}//end merge_matrices
3572
3573/*********************************************************************************************************/
3574 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3575template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3576const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3577merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3578 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3579 const LocalOrdinalViewType & Acol2Brow,
3580 const LocalOrdinalViewType & Acol2Irow,
3581 const LocalOrdinalViewType & Bcol2Ccol,
3582 const LocalOrdinalViewType & Icol2Ccol,
3583 const size_t mergedNodeNumCols)
3584{
3585 using Teuchos::RCP;
3586 typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3587 typedef typename KBCRS::StaticCrsGraphType graph_t;
3588 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3589 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3590 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3591
3592 // Grab the KokkosSparse::BsrMatrix
3593 const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3594 const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3595
3596 // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3597 if(!Bview.importMatrix.is_null() ||
3598 (Bview.importMatrix.is_null() &&
3599 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3600
3601 // We do have a Bimport
3602 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3603 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3604 RCP<const KBCRS> Ik_;
3605 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3606 const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3607 KBCRS Iks;
3608 if(Ik!=0) Iks = *Ik;
3609 size_t merge_numrows = Ak.numCols();
3610
3611 // The last entry of this at least, need to be initialized
3612 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3613
3614 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3615
3616 // Use a Kokkos::parallel_scan to build the rowptr
3617 typedef typename Node::execution_space execution_space;
3618 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3619 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3620 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3621 if(final) Mrowptr(i) = update;
3622 // Get the row count
3623 size_t ct=0;
3624 if(Acol2Brow(i)!=LO_INVALID)
3625 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3626 else
3627 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3628 update+=ct;
3629
3630 if(final && i+1==merge_numrows)
3631 Mrowptr(i+1)=update;
3632 });
3633
3634 // Allocate nnz
3635 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3636 const int blocksize = Ak.blockDim();
3637 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3638 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3639
3640 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3641 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3642 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3643 if(Acol2Brow(i)!=LO_INVALID) {
3644 size_t row = Acol2Brow(i);
3645 size_t start = Bk.graph.row_map(row);
3646 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3647 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3648
3649 for (int b=0; b<blocksize*blocksize; ++b) {
3650 const int val_indx = j*blocksize*blocksize + b;
3651 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3652 Mvalues(val_indx) = Bk.values(b_val_indx);
3653 }
3654 }
3655 }
3656 else {
3657 size_t row = Acol2Irow(i);
3658 size_t start = Iks.graph.row_map(row);
3659 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3660 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3661
3662 for (int b=0; b<blocksize*blocksize; ++b) {
3663 const int val_indx = j*blocksize*blocksize + b;
3664 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3665 Mvalues(val_indx) = Iks.values(b_val_indx);
3666 }
3667 }
3668 }
3669 });
3670
3671 // Build and return merged KokkosSparse matrix
3672 KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3673 return newmat;
3674 }
3675 else {
3676 // We don't have a Bimport (the easy case)
3677 return Bk;
3678 }
3679}//end merge_matrices
3680
3681/*********************************************************************************************************/
3682template<typename SC, typename LO, typename GO, typename NO>
3683void AddKernels<SC, LO, GO, NO>::
3684addSorted(
3685 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3686 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3687 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3688 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3689 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3690 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3691 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3692 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3693#if KOKKOSKERNELS_VERSION >= 40299
3694 GO numGlobalCols,
3695#endif
3696 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3697 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3698 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3699{
3700 using Teuchos::TimeMonitor;
3701 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3702 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3703 auto nrows = Arowptrs.extent(0) - 1;
3704 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3705 typename AddKern::KKH handle;
3706 handle.create_spadd_handle(true);
3707 auto addHandle = handle.get_spadd_handle();
3708#ifdef HAVE_TPETRA_MMM_TIMINGS
3709 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3710#endif
3711 KokkosSparse::Experimental::spadd_symbolic
3712 (&handle,
3713#if KOKKOSKERNELS_VERSION >= 40299
3714 nrows, numGlobalCols,
3715#endif
3716 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3717 //KokkosKernels requires values to be zeroed
3718 Cvals = values_array("C values", addHandle->get_c_nnz());
3719 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3720#ifdef HAVE_TPETRA_MMM_TIMINGS
3721 MM = Teuchos::null;
3722 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3723#endif
3724 KokkosSparse::Experimental::spadd_numeric(&handle,
3725#if KOKKOSKERNELS_VERSION >= 40299
3726 nrows, numGlobalCols,
3727#endif
3728 Arowptrs, Acolinds, Avals, scalarA,
3729 Browptrs, Bcolinds, Bvals, scalarB,
3730 Crowptrs, Ccolinds, Cvals);
3731}
3732
3733template<typename SC, typename LO, typename GO, typename NO>
3734void AddKernels<SC, LO, GO, NO>::
3735addUnsorted(
3736 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3737 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3738 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3739 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3740 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3741 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3742 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3743 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3744#if KOKKOSKERNELS_VERSION >= 40299
3745 GO numGlobalCols,
3746#else
3747 GO /* numGlobalCols */,
3748#endif
3749 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3750 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3751 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3752{
3753 using Teuchos::TimeMonitor;
3754 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3755 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3756 auto nrows = Arowptrs.extent(0) - 1;
3757 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3758 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3759 typename AddKern::KKH handle;
3760 handle.create_spadd_handle(false);
3761 auto addHandle = handle.get_spadd_handle();
3762#ifdef HAVE_TPETRA_MMM_TIMINGS
3763 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3764#endif
3765 KokkosSparse::Experimental::spadd_symbolic
3766 (&handle,
3767#if KOKKOSKERNELS_VERSION >= 40299
3768 nrows, numGlobalCols,
3769#endif
3770 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3771 //Cvals must be zeroed out
3772 Cvals = values_array("C values", addHandle->get_c_nnz());
3773 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3774#ifdef HAVE_TPETRA_MMM_TIMINGS
3775 MM = Teuchos::null;
3776 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3777#endif
3778 KokkosSparse::Experimental::spadd_numeric(&handle,
3779#if KOKKOSKERNELS_VERSION >= 40299
3780 nrows, numGlobalCols,
3781#endif
3782 Arowptrs, Acolinds, Avals, scalarA,
3783 Browptrs, Bcolinds, Bvals, scalarB,
3784 Crowptrs, Ccolinds, Cvals);
3785}
3786
3787template<typename GO,
3788 typename LocalIndicesType,
3789 typename GlobalIndicesType,
3790 typename ColMapType>
3791struct ConvertLocalToGlobalFunctor
3792{
3793 ConvertLocalToGlobalFunctor(
3794 const LocalIndicesType& colindsOrig_,
3795 const GlobalIndicesType& colindsConverted_,
3796 const ColMapType& colmap_) :
3797 colindsOrig (colindsOrig_),
3798 colindsConverted (colindsConverted_),
3799 colmap (colmap_)
3800 {}
3801 KOKKOS_INLINE_FUNCTION void
3802 operator() (const GO i) const
3803 {
3804 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3805 }
3806 LocalIndicesType colindsOrig;
3807 GlobalIndicesType colindsConverted;
3808 ColMapType colmap;
3809};
3810
3811template<typename SC, typename LO, typename GO, typename NO>
3812void MMdetails::AddKernels<SC, LO, GO, NO>::
3813convertToGlobalAndAdd(
3814 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3815 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3816 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3817 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3818 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3819 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3820 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3821 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3822 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3823{
3824 using Teuchos::TimeMonitor;
3825 //Need to use a different KokkosKernelsHandle type than other versions,
3826 //since the ordinals are now GO
3827 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3828 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3829
3830 const values_array Avals = A.values;
3831 const values_array Bvals = B.values;
3832 const col_inds_array Acolinds = A.graph.entries;
3833 const col_inds_array Bcolinds = B.graph.entries;
3834 auto Arowptrs = A.graph.row_map;
3835 auto Browptrs = B.graph.row_map;
3836 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3837 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3838#ifdef HAVE_TPETRA_MMM_TIMINGS
3839 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3840#endif
3841 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3842 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3843 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3844 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3845 KKH_GO handle;
3846 handle.create_spadd_handle(false);
3847 auto addHandle = handle.get_spadd_handle();
3848#ifdef HAVE_TPETRA_MMM_TIMINGS
3849 MM = Teuchos::null;
3850 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3851#endif
3852 auto nrows = Arowptrs.extent(0) - 1;
3853 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3854 KokkosSparse::Experimental::spadd_symbolic
3855 (&handle,
3856#if KOKKOSKERNELS_VERSION >= 40299
3857 nrows, A.numCols(),
3858#endif
3859 Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3860 Cvals = values_array("C values", addHandle->get_c_nnz());
3861 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3862#ifdef HAVE_TPETRA_MMM_TIMINGS
3863 MM = Teuchos::null;
3864 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3865#endif
3866 KokkosSparse::Experimental::spadd_numeric(&handle,
3867#if KOKKOSKERNELS_VERSION >= 40299
3868 nrows, A.numCols(),
3869#endif
3870 Arowptrs, AcolindsConverted, Avals, scalarA,
3871 Browptrs, BcolindsConverted, Bvals, scalarB,
3872 Crowptrs, Ccolinds, Cvals);
3873}
3874
3875
3876} //End namepsace MMdetails
3877
3878} //End namespace Tpetra
3879
3880/*********************************************************************************************************/
3881//
3882// Explicit instantiation macro
3883//
3884// Must be expanded from within the Tpetra namespace!
3885//
3886namespace Tpetra {
3887
3888#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3889template \
3890 void MatrixMatrix::Multiply( \
3891 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3892 bool transposeA, \
3893 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3894 bool transposeB, \
3895 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3896 bool call_FillComplete_on_result, \
3897 const std::string & label, \
3898 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3899\
3900template \
3901 void MatrixMatrix::Multiply( \
3902 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3903 bool transposeA, \
3904 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3905 bool transposeB, \
3906 Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3907 const std::string & label); \
3908\
3909template \
3910 void MatrixMatrix::Jacobi( \
3911 SCALAR omega, \
3912 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3913 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3914 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3915 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3916 bool call_FillComplete_on_result, \
3917 const std::string & label, \
3918 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3919\
3920 template \
3921 void MatrixMatrix::Add( \
3922 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3923 bool transposeA, \
3924 SCALAR scalarA, \
3925 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3926 bool transposeB, \
3927 SCALAR scalarB, \
3928 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3929\
3930 template \
3931 void MatrixMatrix::Add( \
3932 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3933 bool transposeA, \
3934 SCALAR scalarA, \
3935 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3936 bool transposeB, \
3937 SCALAR scalarB, \
3938 const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3939\
3940 template \
3941 void MatrixMatrix::Add( \
3942 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3943 bool transposeA, \
3944 SCALAR scalarA, \
3945 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3946 SCALAR scalarB ); \
3947\
3948 template \
3949 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3950 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3951 (const SCALAR & alpha, \
3952 const bool transposeA, \
3953 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3954 const SCALAR & beta, \
3955 const bool transposeB, \
3956 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3957 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3958 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3959 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3960\
3961 template \
3962 void \
3963 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3964 (const SCALAR & alpha, \
3965 const bool transposeA, \
3966 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3967 const SCALAR& beta, \
3968 const bool transposeB, \
3969 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3970 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3971 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3972 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3973 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3974\
3975 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3976\
3977 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3978 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3979 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3980 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3981 bool userAssertsThereAreNoRemotes, \
3982 const std::string& label, \
3983 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3984\
3985 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3986 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3987 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3988 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3989 bool userAssertsThereAreNoRemotes);
3990} //End namespace Tpetra
3991
3992#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
void start()
Start the deep_copy counter.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
size_t global_size_t
Global size_t object.