MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20
21#include "MueLu_MasterList.hpp"
22#include "MueLu_PerfUtils.hpp"
23#include "MueLu_Monitor.hpp"
24
25#include "Xpetra_IO.hpp"
26
27namespace MueLu {
28
29namespace { // anonymous
30
31template <class LocalOrdinal, class View>
32class ReduceMaxFunctor {
33 public:
34 ReduceMaxFunctor(View view)
35 : view_(view) {}
36
37 KOKKOS_INLINE_FUNCTION
38 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
39 if (vmax < view_(i))
40 vmax = view_(i);
41 }
42
43 KOKKOS_INLINE_FUNCTION
44 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
45 if (dst < src) {
46 dst = src;
47 }
48 }
49
50 KOKKOS_INLINE_FUNCTION
51 void init(LocalOrdinal& dst) const {
52 dst = 0;
53 }
54
55 private:
57};
58
59// local QR decomposition
60template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
61class LocalQRDecompFunctor {
62 private:
63 typedef LOType LO;
64 typedef GOType GO;
65 typedef SCType SC;
66
67 typedef typename DeviceType::execution_space execution_space;
68 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
69 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
70 typedef typename impl_ATS::magnitudeType Magnitude;
71
72 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
73 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
74
75 private:
76 NspType fineNS;
77 NspType coarseNS;
78 aggRowsType aggRows;
79 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
80 agg2RowMapLOType agg2RowMapLO;
81 statusType statusAtomic;
82 rowsType rows;
83 rowsAuxType rowsAux;
84 colsAuxType colsAux;
85 valsAuxType valsAux;
87
88 public:
89 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
90 : fineNS(fineNS_)
91 , coarseNS(coarseNS_)
92 , aggRows(aggRows_)
93 , maxAggDofSize(maxAggDofSize_)
94 , agg2RowMapLO(agg2RowMapLO_)
95 , statusAtomic(statusAtomic_)
96 , rows(rows_)
97 , rowsAux(rowsAux_)
98 , colsAux(colsAux_)
99 , valsAux(valsAux_)
100 , doQRStep(doQRStep_) {}
101
102 KOKKOS_INLINE_FUNCTION
103 void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
104 auto agg = thread.league_rank();
105
106 // size of aggregate: number of DOFs in aggregate
107 auto aggSize = aggRows(agg + 1) - aggRows(agg);
108
109 const impl_SC one = impl_ATS::one();
110 const impl_SC two = one + one;
111 const impl_SC zero = impl_ATS::zero();
112 const auto zeroM = impl_ATS::magnitude(zero);
113
114 int m = aggSize;
115 int n = fineNS.extent(1);
116
117 // calculate row offset for coarse nullspace
118 Xpetra::global_size_t offset = agg * n;
119
120 if (doQRStep) {
121 // Extract the piece of the nullspace corresponding to the aggregate
122 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
123 for (int j = 0; j < n; j++)
124 for (int k = 0; k < m; k++)
125 r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
126#if 0
127 printf("A\n");
128 for (int i = 0; i < m; i++) {
129 for (int j = 0; j < n; j++)
130 printf(" %5.3lf ", r(i,j));
131 printf("\n");
132 }
133#endif
134
135 // Calculate QR decomposition (standard)
136 shared_matrix q(thread.team_shmem(), m, m); // Q
137 if (m >= n) {
138 bool isSingular = false;
139
140 // Initialize Q^T
141 auto qt = q;
142 for (int i = 0; i < m; i++) {
143 for (int j = 0; j < m; j++)
144 qt(i, j) = zero;
145 qt(i, i) = one;
146 }
147
148 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
149 // FIXME_KOKKOS: use team
150 Magnitude s = zeroM, norm, norm_x;
151 for (int i = k + 1; i < m; i++)
152 s += pow(impl_ATS::magnitude(r(i, k)), 2);
153 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
154
155 if (norm == zero) {
156 isSingular = true;
157 break;
158 }
159
160 r(k, k) -= norm * one;
161
162 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
163 if (norm_x == zeroM) {
164 // We have a single diagonal element in the column.
165 // No reflections required. Just need to restor r(k,k).
166 r(k, k) = norm * one;
167 continue;
168 }
169
170 // FIXME_KOKKOS: use team
171 for (int i = k; i < m; i++)
172 r(i, k) /= norm_x;
173
174 // Update R(k:m,k+1:n)
175 for (int j = k + 1; j < n; j++) {
176 // FIXME_KOKKOS: use team in the loops
177 impl_SC si = zero;
178 for (int i = k; i < m; i++)
179 si += r(i, k) * r(i, j);
180 for (int i = k; i < m; i++)
181 r(i, j) -= two * si * r(i, k);
182 }
183
184 // Update Q^T (k:m,k:m)
185 for (int j = k; j < m; j++) {
186 // FIXME_KOKKOS: use team in the loops
187 impl_SC si = zero;
188 for (int i = k; i < m; i++)
189 si += r(i, k) * qt(i, j);
190 for (int i = k; i < m; i++)
191 qt(i, j) -= two * si * r(i, k);
192 }
193
194 // Fix R(k:m,k)
195 r(k, k) = norm * one;
196 for (int i = k + 1; i < m; i++)
197 r(i, k) = zero;
198 }
199
200#if 0
201 // Q = (Q^T)^T
202 for (int i = 0; i < m; i++)
203 for (int j = 0; j < i; j++) {
204 impl_SC tmp = qt(i,j);
205 qt(i,j) = qt(j,i);
206 qt(j,i) = tmp;
207 }
208#endif
209
210 // Build coarse nullspace using the upper triangular part of R
211 for (int j = 0; j < n; j++)
212 for (int k = 0; k <= j; k++)
213 coarseNS(offset + k, j) = r(k, j);
214
215 if (isSingular) {
216 statusAtomic(1) = true;
217 return;
218 }
219
220 } else {
221 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
222
223 // The local QR decomposition is not possible in the "overconstrained"
224 // case (i.e. number of columns in qr > number of rowsAux), which
225 // corresponds to #DOFs in Aggregate < n. For usual problems this
226 // is only possible for single node aggregates in structural mechanics.
227 // (Similar problems may arise in discontinuous Galerkin problems...)
228 // We bypass the QR decomposition and use an identity block in the
229 // tentative prolongator for the single node aggregate and transfer the
230 // corresponding fine level null space information 1-to-1 to the coarse
231 // level null space part.
232
233 // NOTE: The resulting tentative prolongation operator has
234 // (m*DofsPerNode-n) zero columns leading to a singular
235 // coarse level operator A. To deal with that one has the following
236 // options:
237 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
238 // false) to add some identity block to the diagonal of the zero rowsAux
239 // in the coarse level operator A, such that standard level smoothers
240 // can be used again.
241 // - Use special (projection-based) level smoothers, which can deal
242 // with singular matrices (very application specific)
243 // - Adapt the code below to avoid zero columns. However, we do not
244 // support a variable number of DOFs per node in MueLu/Xpetra which
245 // makes the implementation really hard.
246 //
247 // FIXME: do we need to check for singularity here somehow? Zero
248 // columns would be easy but linear dependency would require proper QR.
249
250 // R = extended (by adding identity rowsAux) qr
251 for (int j = 0; j < n; j++)
252 for (int k = 0; k < n; k++)
253 if (k < m)
254 coarseNS(offset + k, j) = r(k, j);
255 else
256 coarseNS(offset + k, j) = (k == j ? one : zero);
257
258 // Q = I (rectangular)
259 for (int i = 0; i < m; i++)
260 for (int j = 0; j < n; j++)
261 q(i, j) = (j == i ? one : zero);
262 }
263
264 // Process each row in the local Q factor and fill helper arrays to assemble P
265 for (int j = 0; j < m; j++) {
266 LO localRow = agg2RowMapLO(aggRows(agg) + j);
267 size_t rowStart = rowsAux(localRow);
268 size_t lnnz = 0;
269 for (int k = 0; k < n; k++) {
270 // skip zeros
271 if (q(j, k) != zero) {
272 colsAux(rowStart + lnnz) = offset + k;
273 valsAux(rowStart + lnnz) = q(j, k);
274 lnnz++;
275 }
276 }
277 rows(localRow + 1) = lnnz;
278 nnz += lnnz;
279 }
280
281#if 0
282 printf("R\n");
283 for (int i = 0; i < m; i++) {
284 for (int j = 0; j < n; j++)
285 printf(" %5.3lf ", coarseNS(i,j));
286 printf("\n");
287 }
288
289 printf("Q\n");
290 for (int i = 0; i < aggSize; i++) {
291 for (int j = 0; j < aggSize; j++)
292 printf(" %5.3lf ", q(i,j));
293 printf("\n");
294 }
295#endif
296 } else {
298 // "no-QR" option //
300 // Local Q factor is just the fine nullspace support over the current aggregate.
301 // Local R factor is the identity.
302 // TODO I have not implemented any special handling for aggregates that are too
303 // TODO small to locally support the nullspace, as is done in the standard QR
304 // TODO case above.
305
306 for (int j = 0; j < m; j++) {
307 LO localRow = agg2RowMapLO(aggRows(agg) + j);
308 size_t rowStart = rowsAux(localRow);
309 size_t lnnz = 0;
310 for (int k = 0; k < n; k++) {
311 const impl_SC qr_jk = fineNS(localRow, k);
312 // skip zeros
313 if (qr_jk != zero) {
314 colsAux(rowStart + lnnz) = offset + k;
315 valsAux(rowStart + lnnz) = qr_jk;
316 lnnz++;
317 }
318 }
319 rows(localRow + 1) = lnnz;
320 nnz += lnnz;
321 }
322
323 for (int j = 0; j < n; j++)
324 coarseNS(offset + j, j) = one;
325 }
326 }
327
328 // amount of shared memory
329 size_t team_shmem_size(int /* team_size */) const {
330 if (doQRStep) {
331 int m = maxAggDofSize;
332 int n = fineNS.extent(1);
333 return shared_matrix::shmem_size(m, n) + // r
334 shared_matrix::shmem_size(m, m); // q
335 } else
336 return 0;
337 }
338};
339
340} // namespace
341
342template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
344 RCP<ParameterList> validParamList = rcp(new ParameterList());
345
346#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
347 SET_VALID_ENTRY("tentative: calculate qr");
348 SET_VALID_ENTRY("tentative: build coarse coordinates");
349#undef SET_VALID_ENTRY
350
351 validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
352 validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
353 validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
354 validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
355 validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
356 validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
357 validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
358 validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
359
360 // Make sure we don't recursively validate options for the matrixmatrix kernels
361 ParameterList norecurse;
362 norecurse.disableRecursiveValidation();
363 validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
364
365 return validParamList;
366}
367
368template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370 const ParameterList& pL = GetParameterList();
371 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
372 std::string nspName = "Nullspace";
373 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
374
375 Input(fineLevel, "A");
376 Input(fineLevel, "Aggregates");
377 Input(fineLevel, nspName);
378 Input(fineLevel, "UnAmalgamationInfo");
379 Input(fineLevel, "CoarseMap");
380 if (fineLevel.GetLevelID() == 0 &&
381 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
382 pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
383 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
384 Input(fineLevel, "Coordinates");
385 } else if (bTransferCoordinates_) {
386 Input(fineLevel, "Coordinates");
387 }
388}
389
390template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392 return BuildP(fineLevel, coarseLevel);
393}
394
395template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397 FactoryMonitor m(*this, "Build", coarseLevel);
398
399 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
400 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
401 const ParameterList& pL = GetParameterList();
402 std::string nspName = "Nullspace";
403 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
404
405 auto A = Get<RCP<Matrix>>(fineLevel, "A");
406 auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
407 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
408 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
409 auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
410 RCP<RealValuedMultiVector> fineCoords;
411 if (bTransferCoordinates_) {
412 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
413 }
414
415 RCP<Matrix> Ptentative;
416 // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
417 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
418 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
419 Ptentative = Teuchos::null;
420 Set(coarseLevel, "P", Ptentative);
421 return;
422 }
423 RCP<MultiVector> coarseNullspace;
424 RCP<RealValuedMultiVector> coarseCoords;
425
426 if (bTransferCoordinates_) {
427 RCP<const Map> coarseCoordMap;
428 using array_type = typename Map::global_indices_array_device_type;
429
430 LO blkSize = 1;
431 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
433
434 if (blkSize == 1) {
435 // Scalar system
436 // No amalgamation required, we can use the coarseMap
437 coarseCoordMap = coarseMap;
438 } else {
439 // Vector system
440 // NOTE: There could be further optimizations here where we detect contiguous maps and then
441 // create a contiguous amalgamated maps, which bypasses the expense of the getMyGlobalIndicesDevice()
442 // call (which is free for non-contiguous maps, but costs something if the map is contiguous).
443 using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
444 array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
445 GO indexBase = coarseMap->getIndexBase();
446 auto numElements = elementAList.size() / blkSize;
447 typename array_type::non_const_type elementList_nc("elementList", numElements);
448
449 // Amalgamate the map
450 Kokkos::parallel_for(
451 "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(LO i) {
452 elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
453 });
454 array_type elementList = elementList_nc;
455 coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
456 elementList, indexBase, coarseMap->getComm());
457 }
458
459 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
460
461 // Create overlapped fine coordinates to reduce global communication
462 auto uniqueMap = fineCoords->getMap();
463 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
464 if (aggregates->AggregatesCrossProcessors()) {
465 auto nonUniqueMap = aggregates->GetMap();
466 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
467
468 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
469 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
470 }
471
472 // The good new is that his graph has already been constructed for the
473 // TentativePFactory and was cached in Aggregates. So this is a no-op.
474 auto aggGraph = aggregates->GetGraph();
475 auto numAggs = aggGraph.numRows();
476
477 auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
478 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
479
480 // Fill in coarse coordinates
481 {
482 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
483
484 const auto dim = fineCoords->getNumVectors();
485
486 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
487 for (size_t j = 0; j < dim; j++) {
488 Kokkos::parallel_for(
489 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
490 KOKKOS_LAMBDA(const LO i) {
491 // A row in this graph represents all node ids in the aggregate
492 // Therefore, averaging is very easy
493
494 auto aggregate = aggGraph.rowConst(i);
495
496 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
497 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
498 sum += fineCoordsRandomView(aggregate(colID), j);
499
500 coarseCoordsView(i, j) = sum / aggregate.length;
501 });
502 }
503 }
504 }
505
506 if (!aggregates->AggregatesCrossProcessors()) {
507 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
508 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
509 coarseLevel.GetLevelID());
510 } else {
511 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
512 }
513 } else
514 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
515
516 // If available, use striding information of fine level matrix A for range
517 // map and coarseMap as domain map; otherwise use plain range map of
518 // Ptent = plain range map of A for range map and coarseMap as domain map.
519 // NOTE:
520 // The latter is not really safe, since there is no striding information
521 // for the range map. This is not really a problem, since striding
522 // information is always available on the intermedium levels and the
523 // coarsest levels.
524 if (A->IsView("stridedMaps") == true)
525 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
526 else
527 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
528
529 if (bTransferCoordinates_) {
530 Set(coarseLevel, "Coordinates", coarseCoords);
531 }
532
533 // FIXME: We should remove the NodeComm on levels past the threshold
534 if (fineLevel.IsAvailable("Node Comm")) {
535 RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
536 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
537 }
538
539 Set(coarseLevel, "Nullspace", coarseNullspace);
540 Set(coarseLevel, "P", Ptentative);
541
542 if (IsPrint(Statistics2)) {
543 RCP<ParameterList> params = rcp(new ParameterList());
544 params->set("printLoadBalancingInfo", true);
545 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
546 }
547}
548
549template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
551 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
552 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
553 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
554 RCP<MultiVector>& coarseNullspace, const int levelID) const {
555 auto rowMap = A->getRowMap();
556 auto colMap = A->getColMap();
557
558 const size_t numRows = rowMap->getLocalNumElements();
559 const size_t NSDim = fineNullspace->getNumVectors();
560
561 typedef Kokkos::ArithTraits<SC> ATS;
562 using impl_SC = typename ATS::val_type;
563 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
564 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
565
566 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
567
568 typename Aggregates::local_graph_type aggGraph;
569 {
570 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
571 aggGraph = aggregates->GetGraph();
572 }
573 auto aggRows = aggGraph.row_map;
574 auto aggCols = aggGraph.entries;
575
576 // Aggregates map is based on the amalgamated column map
577 // We can skip global-to-local conversion if LIDs in row map are
578 // same as LIDs in column map
579 bool goodMap;
580 {
581 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
582 goodMap = isGoodMap(*rowMap, *colMap);
583 }
584 // FIXME_KOKKOS: need to proofread later code for bad maps
585 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
586 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587 "(i.e. \"matching\" row and column maps)");
588
589 // STEP 1: do unamalgamation
590 // The non-kokkos version uses member functions from the AmalgamationInfo
591 // container class to unamalgamate the data. In contrast, the kokkos
592 // version of TentativePFactory does the unamalgamation here and only uses
593 // the data of the AmalgamationInfo container class
594
595 // Extract information for unamalgamation
596 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
597 GO indexBase;
598 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599 GO globalOffset = amalgInfo->GlobalOffset();
600
601 // Extract aggregation info (already in Kokkos host views)
602 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
603 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
604 const size_t numAggregates = aggregates->GetNumAggregates();
605
606 int myPID = aggregates->GetMap()->getComm()->getRank();
607
608 // Create Kokkos::View (on the device) to store the aggreate dof sizes
609 // Later used to get aggregate dof offsets
610 // NOTE: This zeros itself on construction
611 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612 AggSizeType aggDofSizes;
613
614 if (stridedBlockSize == 1) {
615 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
616
617 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
618 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
619
620 auto sizesConst = aggregates->ComputeAggregateSizes();
621 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
622
623 } else {
624 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
625
626 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
627 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
628
629 auto nodeMap = aggregates->GetMap()->getLocalMap();
630 auto dofMap = colMap->getLocalMap();
631
632 Kokkos::parallel_for(
633 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
634 KOKKOS_LAMBDA(const LO agg) {
635 auto aggRowView = aggGraph.rowConst(agg);
636
637 size_t size = 0;
638 for (LO colID = 0; colID < aggRowView.length; colID++) {
639 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
640
641 for (LO k = 0; k < stridedBlockSize; k++) {
642 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
643
644 if (dofMap.getLocalElement(dofGID) != INVALID)
645 size++;
646 }
647 }
648 aggDofSizes(agg + 1) = size;
649 });
650 }
651
652 // Find maximum dof size for aggregates
653 // Later used to reserve enough scratch space for local QR decompositions
654 LO maxAggSize = 0;
655 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
656 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
657
658 // parallel_scan (exclusive)
659 // The aggDofSizes View then contains the aggregate dof offsets
660 Kokkos::parallel_scan(
661 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
662 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
663 update += aggDofSizes(i);
664 if (final_pass)
665 aggDofSizes(i) = update;
666 });
667
668 // Create Kokkos::View on the device to store mapping
669 // between (local) aggregate id and row map ids (LIDs)
670 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
671 {
672 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
673
674 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
675 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
676
677 Kokkos::parallel_for(
678 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
679 KOKKOS_LAMBDA(const LO lnode) {
680 if (procWinner(lnode, 0) == myPID) {
681 // No need for atomics, it's one-to-one
682 auto aggID = vertex2AggId(lnode, 0);
683
684 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
685 // FIXME: I think this may be wrong
686 // We unconditionally add the whole block here. When we calculated
687 // aggDofSizes, we did the isLocalElement check. Something's fishy.
688 for (LO k = 0; k < stridedBlockSize; k++)
689 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
690 }
691 });
692 }
693
694 // STEP 2: prepare local QR decomposition
695 // Reserve memory for tentative prolongation operator
696 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
697
698 // Pull out the nullspace vectors so that we can have random access (on the device)
699 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
700 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
701
702 size_t nnz = 0; // actual number of nnz
703
704 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
705 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706 typedef typename local_matrix_type::index_type::non_const_type cols_type;
707 typedef typename local_matrix_type::values_type::non_const_type vals_type;
708
709 // Device View for status (error messages...)
710 typedef Kokkos::View<int[10], DeviceType> status_type;
711 status_type status("status");
712
713 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
715
716 const ParameterList& pL = GetParameterList();
717 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
718 if (!doQRStep) {
719 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
720 if (NSDim > 1)
721 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
722 }
723
724 size_t nnzEstimate = numRows * NSDim;
725 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
726 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
727 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
728 rows_type rows("Ptent_rows", numRows + 1);
729 {
730 // Stage 0: fill in views.
731 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
732
733 // The main thing to notice is initialization of vals with INVALID. These
734 // values will later be used to compress the arrays
735 Kokkos::parallel_for(
736 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
737 KOKKOS_LAMBDA(const LO row) {
738 rowsAux(row) = row * NSDim;
739 });
740 Kokkos::parallel_for(
741 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
742 KOKKOS_LAMBDA(const LO j) {
743 colsAux(j) = INVALID;
744 });
745 }
746
747 if (NSDim == 1) {
748 // 1D is special, as it is the easiest. We don't even need to the QR,
749 // just normalize an array. Plus, no worries abot small aggregates. In
750 // addition, we do not worry about compression. It is unlikely that
751 // nullspace will have zeros. If it does, a prolongator row would be
752 // zero and we'll get singularity anyway.
753 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
754
755 // Set up team policy with numAggregates teams and one thread per team.
756 // Each team handles a slice of the data associated with one aggregate
757 // and performs a local QR decomposition (in this case real QR is
758 // unnecessary).
759 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
760
761 if (doQRStep) {
762 Kokkos::parallel_for(
763 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765 auto agg = thread.league_rank();
766
767 // size of the aggregate (number of DOFs in aggregate)
768 LO aggSize = aggRows(agg + 1) - aggRows(agg);
769
770 // Extract the piece of the nullspace corresponding to the aggregate, and
771 // put it in the flat array, "localQR" (in column major format) for the
772 // QR routine. Trivial in 1D.
773 auto norm = impl_ATS::magnitude(zero);
774
775 // Calculate QR by hand
776 // FIXME: shouldn't there be stridedblock here?
777 // FIXME_KOKKOS: shouldn't there be stridedblock here?
778 for (decltype(aggSize) k = 0; k < aggSize; k++) {
779 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780 norm += dnorm * dnorm;
781 }
782 norm = sqrt(norm);
783
784 if (norm == zero) {
785 // zero column; terminate the execution
786 statusAtomic(1) = true;
787 return;
788 }
789
790 // R = norm
791 coarseNS(agg, 0) = norm;
792
793 // Q = localQR(:,0)/norm
794 for (decltype(aggSize) k = 0; k < aggSize; k++) {
795 LO localRow = agg2RowMapLO(aggRows(agg) + k);
796 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
797
798 rows(localRow + 1) = 1;
799 colsAux(localRow) = agg;
800 valsAux(localRow) = localVal;
801 }
802 });
803
804 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
805 Kokkos::deep_copy(statusHost, status);
806 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
807 if (statusHost(i)) {
808 std::ostringstream oss;
809 oss << "MueLu::TentativePFactory::MakeTentative: ";
810 switch (i) {
811 case 0: oss << "!goodMap is not implemented"; break;
812 case 1: oss << "fine level NS part has a zero column"; break;
813 }
814 throw Exceptions::RuntimeError(oss.str());
815 }
816
817 } else {
818 Kokkos::parallel_for(
819 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821 auto agg = thread.league_rank();
822
823 // size of the aggregate (number of DOFs in aggregate)
824 LO aggSize = aggRows(agg + 1) - aggRows(agg);
825
826 // R = norm
827 coarseNS(agg, 0) = one;
828
829 // Q = localQR(:,0)/norm
830 for (decltype(aggSize) k = 0; k < aggSize; k++) {
831 LO localRow = agg2RowMapLO(aggRows(agg) + k);
832 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
833
834 rows(localRow + 1) = 1;
835 colsAux(localRow) = agg;
836 valsAux(localRow) = localVal;
837 }
838 });
839 }
840
841 Kokkos::parallel_reduce(
842 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
843 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
844 nnz_count += rows(i);
845 },
846 nnz);
847
848 } else { // NSdim > 1
849 // FIXME_KOKKOS: This code branch is completely unoptimized.
850 // Work to do:
851 // - Optimize QR decomposition
852 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
853 // packing new values in the beginning of each row
854 // We do use auxilary view in this case, so keep a second rows view for
855 // counting nonzeros in rows
856
857 {
858 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
859 // Set up team policy with numAggregates teams and one thread per team.
860 // Each team handles a slice of the data associated with one aggregate
861 // and performs a local QR decomposition
862 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
863 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
864 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
865 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
866 decltype(valsAux)>
867 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
869 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
870 }
871
872 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
873 Kokkos::deep_copy(statusHost, status);
874 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
875 if (statusHost(i)) {
876 std::ostringstream oss;
877 oss << "MueLu::TentativePFactory::MakeTentative: ";
878 switch (i) {
879 case 0: oss << "!goodMap is not implemented"; break;
880 case 1: oss << "fine level NS part has a zero column"; break;
881 }
882 throw Exceptions::RuntimeError(oss.str());
883 }
884 }
885
886 // Compress the cols and vals by ignoring INVALID column entries that correspond
887 // to 0 in QR.
888
889 // The real cols and vals are constructed using calculated (not estimated) nnz
890 cols_type cols;
891 vals_type vals;
892
893 if (nnz != nnzEstimate) {
894 {
895 // Stage 2: compress the arrays
896 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
897
898 Kokkos::parallel_scan(
899 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
900 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
901 upd += rows(i);
902 if (final)
903 rows(i) = upd;
904 });
905 }
906
907 {
908 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
909
910 cols = cols_type("Ptent_cols", nnz);
911 vals = vals_type("Ptent_vals", nnz);
912
913 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
914 // to the beginning of rows. See CoalesceDropFactory_kokkos for
915 // example.
916 Kokkos::parallel_for(
917 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
918 KOKKOS_LAMBDA(const LO i) {
919 LO rowStart = rows(i);
920
921 size_t lnnz = 0;
922 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923 if (colsAux(j) != INVALID) {
924 cols(rowStart + lnnz) = colsAux(j);
925 vals(rowStart + lnnz) = valsAux(j);
926 lnnz++;
927 }
928 });
929 }
930
931 } else {
932 rows = rowsAux;
933 cols = colsAux;
934 vals = valsAux;
935 }
936
937 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
938
939 {
940 // Stage 3: construct Xpetra::Matrix
941 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
942
943 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
944
945 // Managing labels & constants for ESFC
946 RCP<ParameterList> FCparams;
947 if (pL.isSublist("matrixmatrix: kernel params"))
948 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
949 else
950 FCparams = rcp(new ParameterList);
951
952 // By default, we don't need global constants for TentativeP
953 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
954 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
955
956 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
958 }
959}
960
961template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
963 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
964 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966 RCP<MultiVector>& coarseNullspace, const int levelID) const {
967 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
968 be generalized later, if we ever need to do so:
969 1) Null space dimension === block size of matrix: So no elasticity right now
970 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
971 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
972
973 These assumptions keep our code way simpler and still support the use cases we actually care about.
974 */
975
976 RCP<const Map> rowMap = A->getRowMap();
977 RCP<const Map> rangeMap = A->getRangeMap();
978 RCP<const Map> colMap = A->getColMap();
979 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
980 const size_t numFineBlockRows = rowMap->getLocalNumElements();
981
982 // typedef Teuchos::ScalarTraits<SC> STS;
983 // typedef typename STS::magnitudeType Magnitude;
984 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
985
986 typedef Kokkos::ArithTraits<SC> ATS;
987 using impl_SC = typename ATS::val_type;
988 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
989 const impl_SC one = impl_ATS::one();
990
991 // const GO numAggs = aggregates->GetNumAggregates();
992 const size_t NSDim = fineNullspace->getNumVectors();
993 auto aggSizes = aggregates->ComputeAggregateSizes();
994
995 typename Aggregates::local_graph_type aggGraph;
996 {
997 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
998 aggGraph = aggregates->GetGraph();
999 }
1000 auto aggRows = aggGraph.row_map;
1001 auto aggCols = aggGraph.entries;
1002
1003 // Need to generate the coarse block map
1004 // NOTE: We assume NSDim == block size here
1005 // NOTE: We also assume that coarseMap has contiguous GIDs
1006 // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1007 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1008 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1009 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1010 numCoarseBlockRows,
1011 coarsePointMap->getIndexBase(),
1012 coarsePointMap->getComm());
1013 // Sanity checking
1014 const ParameterList& pL = GetParameterList();
1015 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1016
1017 // The aggregates use the amalgamated column map, which in this case is what we want
1018
1019 // Aggregates map is based on the amalgamated column map
1020 // We can skip global-to-local conversion if LIDs in row map are
1021 // same as LIDs in column map
1022 bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1023 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1024 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1025 "(i.e. \"matching\" row and column maps)");
1026
1027 // STEP 1: do unamalgamation
1028 // The non-kokkos version uses member functions from the AmalgamationInfo
1029 // container class to unamalgamate the data. In contrast, the kokkos
1030 // version of TentativePFactory does the unamalgamation here and only uses
1031 // the data of the AmalgamationInfo container class
1032
1033 // Extract information for unamalgamation
1034 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1035 GO indexBase;
1036 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1037 // GO globalOffset = amalgInfo->GlobalOffset();
1038
1039 // Extract aggregation info (already in Kokkos host views)
1040 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1041 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1042 const size_t numAggregates = aggregates->GetNumAggregates();
1043
1044 int myPID = aggregates->GetMap()->getComm()->getRank();
1045
1046 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1047 // Later used to get aggregate dof offsets
1048 // NOTE: This zeros itself on construction
1049 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1050 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1051
1052 {
1053 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1054
1055 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1056 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1057
1058 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1059 }
1060
1061 // Find maximum dof size for aggregates
1062 // Later used to reserve enough scratch space for local QR decompositions
1063 LO maxAggSize = 0;
1064 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1065 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1066
1067 // parallel_scan (exclusive)
1068 // The aggDofSizes View then contains the aggregate dof offsets
1069 Kokkos::parallel_scan(
1070 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1071 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1072 update += aggDofSizes(i);
1073 if (final_pass)
1074 aggDofSizes(i) = update;
1075 });
1076
1077 // Create Kokkos::View on the device to store mapping
1078 // between (local) aggregate id and row map ids (LIDs)
1079 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1080 {
1081 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1082
1083 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1084 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1085
1086 Kokkos::parallel_for(
1087 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1088 KOKKOS_LAMBDA(const LO lnode) {
1089 if (procWinner(lnode, 0) == myPID) {
1090 // No need for atomics, it's one-to-one
1091 auto aggID = vertex2AggId(lnode, 0);
1092
1093 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1094 // FIXME: I think this may be wrong
1095 // We unconditionally add the whole block here. When we calculated
1096 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1097 for (LO k = 0; k < stridedBlockSize; k++)
1098 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1099 }
1100 });
1101 }
1102
1103 // STEP 2: prepare local QR decomposition
1104 // Reserve memory for tentative prolongation operator
1105 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1106
1107 // Pull out the nullspace vectors so that we can have random access (on the device)
1108 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1109 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1110
1111 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1112 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1113 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1114 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1115
1116 // Device View for status (error messages...)
1117 typedef Kokkos::View<int[10], DeviceType> status_type;
1118 status_type status("status");
1119
1120 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1122
1123 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1124 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1125
1126 // BlockCrs requires that we build the (block) graph first, so let's do that...
1127
1128 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1129 // block non-zero per row in the matrix;
1130 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1131 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1132
1133 Kokkos::parallel_for(
1134 "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1135 KOKKOS_LAMBDA(const LO j) {
1136 ia[j] = j;
1137 ja[j] = INVALID;
1138
1139 if (j == (LO)numFineBlockRows - 1)
1140 ia[numFineBlockRows] = numFineBlockRows;
1141 });
1142
1143 // Fill Graph
1144 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1145 Kokkos::parallel_for(
1146 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1147 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1148 auto agg = thread.league_rank();
1149 Xpetra::global_size_t offset = agg;
1150
1151 // size of the aggregate (number of DOFs in aggregate)
1152 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1153
1154 for (LO j = 0; j < aggSize; j++) {
1155 // FIXME: Allow for bad maps
1156 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1157 const size_t rowStart = ia[localRow];
1158 ja[rowStart] = offset;
1159 }
1160 });
1161
1162 // Compress storage (remove all INVALID, which happen when we skip zeros)
1163 // We do that in-place
1164 {
1165 // Stage 2: compress the arrays
1166 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1167 // Fill i_temp with the correct row starts
1168 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1169 LO nnz = 0;
1170 Kokkos::parallel_scan(
1171 "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1172 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1173 if (final)
1174 i_temp[i] = upd;
1175 for (auto j = ia[i]; j < ia[i + 1]; j++)
1176 if (ja[j] != INVALID)
1177 upd++;
1178 if (final && i == (LO)numFineBlockRows - 1)
1179 i_temp[numFineBlockRows] = upd;
1180 },
1181 nnz);
1182
1183 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1184
1185 Kokkos::parallel_for(
1186 "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1187 KOKKOS_LAMBDA(const LO i) {
1188 size_t rowStart = i_temp[i];
1189 size_t lnnz = 0;
1190 for (auto j = ia[i]; j < ia[i + 1]; j++)
1191 if (ja[j] != INVALID) {
1192 j_temp[rowStart + lnnz] = ja[j];
1193 lnnz++;
1194 }
1195 });
1196
1197 ia = i_temp;
1198 ja = j_temp;
1199 }
1200
1201 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1202
1203 // Managing labels & constants for ESFC
1204 {
1205 RCP<ParameterList> FCparams;
1206 if (pL.isSublist("matrixmatrix: kernel params"))
1207 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1208 else
1209 FCparams = rcp(new ParameterList);
1210 // By default, we don't need global constants for TentativeP
1211 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1212 std::string levelIDs = toString(levelID);
1213 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1214 RCP<const Export> dummy_e;
1215 RCP<const Import> dummy_i;
1216 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1217 }
1218
1219 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1220 // we clear them here
1221 ia = rows_type();
1222 ja = cols_type();
1223
1224 // Now let's make a BlockCrs Matrix
1225 // NOTE: Assumes block size== NSDim
1226 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1227 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1228 if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1229 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1230
1231 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1232 const LO stride = NSDim * NSDim;
1233
1234 Kokkos::parallel_for(
1235 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1236 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1237 auto agg = thread.league_rank();
1238
1239 // size of the aggregate (number of DOFs in aggregate)
1240 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1241 Xpetra::global_size_t offset = agg * NSDim;
1242
1243 // Q = localQR(:,0)/norm
1244 for (LO j = 0; j < aggSize; j++) {
1245 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1246 LO rowStart = localBlockRow * stride;
1247 for (LO r = 0; r < (LO)NSDim; r++) {
1248 LO localPointRow = localBlockRow * NSDim + r;
1249 for (LO c = 0; c < (LO)NSDim; c++) {
1250 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1251 }
1252 }
1253 }
1254
1255 // R = norm
1256 for (LO j = 0; j < (LO)NSDim; j++)
1257 coarseNS(offset + j, j) = one;
1258 });
1259
1260 Ptentative = P_wrap;
1261}
1262
1263template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1265 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1266 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1267 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1268 RCP<MultiVector>& /* coarseNullspace */) const {
1269 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1270}
1271
1272template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1274 isGoodMap(const Map& rowMap, const Map& colMap) const {
1275 auto rowLocalMap = rowMap.getLocalMap();
1276 auto colLocalMap = colMap.getLocalMap();
1277
1278 const size_t numRows = rowLocalMap.getLocalNumElements();
1279 const size_t numCols = colLocalMap.getLocalNumElements();
1280
1281 if (numCols < numRows)
1282 return false;
1283
1284 size_t numDiff = 0;
1285 Kokkos::parallel_reduce(
1286 "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1287 KOKKOS_LAMBDA(const LO i, size_t& diff) {
1288 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1289 },
1290 numDiff);
1291
1292 return (numDiff == 0);
1293}
1294
1295} // namespace MueLu
1296
1297#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1298#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
typename LWGraph_kokkos::local_graph_type local_graph_type
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
bool isGoodMap(const Map &rowMap, const Map &colMap) const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.