diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst
index 6d4c8695d9..6b24ad98b0 100644
--- a/docs/sphinx/user_guide/feature/reduction.rst
+++ b/docs/sphinx/user_guide/feature/reduction.rst
@@ -190,6 +190,9 @@ RAJA::expt::Reduce
 ..................
 ::
 
+   using VALOP_DOUBLE_SUM = RAJA::expt::ValOp<double, RAJA::operators::plus>;
+   using VALOP_DOUBLE_MIN = RAJA::expt::ValOp<double, RAJA::operators::minimum>;
+
    double* a = ...;
 
    double rs = 0.0;
@@ -198,9 +201,9 @@ RAJA::expt::Reduce
    RAJA::forall<EXEC_POL> ( Res, Seg,
      RAJA::expt::Reduce<RAJA::operators::plus>(&rs),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),
-     [=] (int i, double& _rs, double& _rm) {
+     [=] (int i, VALOP_DOUBLE_SUM& _rs, VALOP_DOUBLE_MIN& _rm) {
        _rs += a[i];
-       _rm = RAJA_MIN(a[i], _rm);
+       _rm.min(a[i]);
      }
    );
 
@@ -213,13 +216,14 @@ RAJA::expt::Reduce
   above. The reduction operation will include the existing value of the given
   target variable.
 * The kernel body lambda expression passed to ``RAJA::forall`` must have a
-  parameter corresponding to each ``RAJA::expt::Reduce`` argument, ``_rs`` and
-  ``_rm`` in the example code. These parameters refer to a local target for each
-  reduction operation. It is important to note that the parameters follow the
-  kernel iteration variable, ``i`` in this case, and appear in the same order
-  as the corresponding ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. The
-  parameter types must be references to the types used in the
-  ``RAJA::expt::Reduce`` arguments.
+  ``RAJA::expt::ValOp`` parameter corresponding to each ``RAJA::expt::Reduce``
+  argument, ``_rs`` and ``_rm`` in the example code. These parameters refer to
+  a local target for each reduction operation. Each ``ValOp`` must be templated
+  on the underlying data type (``double`` for ``_rs`` and ``_rm``) and on the
+  operator being used. It is important to note that the parameters follow the
+  kernel iteration variable, ``i`` in this case, and appear in the same order
+  as the corresponding ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``.
+  The ``ValOp`` parameters must be references to the objects instantiated by
+  the ``RAJA::expt::Reduce`` arguments.
 * The local variables referred to by ``_rs`` and ``_rm`` are initialized with
   the *identity* of the reduction operation to be performed.
 * The local variables are updated in the user supplied lambda.
@@ -236,10 +240,19 @@ RAJA::expt::Reduce
   compatible with the ``EXEC_POL``. ``Seg`` is the iteration space object for
   ``RAJA::forall``.
 
-.. important:: The order and types of the local reduction variables in the
-               kernel body lambda expression must match exactly with the
-               corresponding ``RAJA::expt::Reduce`` arguments to the
-               ``RAJA::forall`` to ensure that the correct result is obtained.
+.. important:: * ``RAJA::expt::Reduce`` arguments must be passed to the forall.
+                 These arguments are templated on the reduction operator and
+                 take a pointer to the target reduction variable that was
+                 declared outside of the forall.
+               * The local reduction arguments to the lambda expression must be
+                 ``RAJA::expt::ValOp`` references. Each ``ValOp`` reference
+                 corresponds to a ``RAJA::expt::Reduce`` argument within the
+                 forall.
+               * The ordering of the ``ValOp`` references must correspond to
+                 the ordering of the ``RAJA::expt::Reduce`` arguments to ensure
+                 that the correct result is obtained.
+               * Each ``ValOp`` underlying data type and RAJA operator must
+                 match the data type and operator template argument in the
+                 corresponding ``RAJA::expt::Reduce`` argument.
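+
+For reference, the following is a minimal, self-contained sketch of the
+pattern above. It is purely illustrative: the sequential policy, range, and
+data used here are placeholder choices, not requirements of the interface::
+
+   #include "RAJA/RAJA.hpp"
+   #include <iostream>
+
+   int main()
+   {
+     constexpr int N = 100;
+     double a[N];
+     for (int i = 0; i < N; ++i) { a[i] = 1.0 * i; }
+
+     using VALOP_DOUBLE_SUM = RAJA::expt::ValOp<double, RAJA::operators::plus>;
+     using VALOP_DOUBLE_MIN = RAJA::expt::ValOp<double, RAJA::operators::minimum>;
+
+     double rs = 0.0;    // target variable for the sum reduction
+     double rm = 1e100;  // target variable for the min reduction
+
+     RAJA::forall<RAJA::seq_exec>( RAJA::TypedRangeSegment<int>(0, N),
+       RAJA::expt::Reduce<RAJA::operators::plus>(&rs),
+       RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),
+       [=] (int i, VALOP_DOUBLE_SUM& _rs, VALOP_DOUBLE_MIN& _rm) {
+         _rs += a[i];    // sum via ValOp::operator+=
+         _rm.min(a[i]);  // min via ValOp::min()
+       }
+     );
+
+     std::cout << rs << " " << rm << std::endl;  // expect 4950 and 0
+     return 0;
+   }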
 
 RAJA::expt::ValLoc
 ..................
 
@@ -247,36 +260,89 @@ RAJA::expt::ValLoc
 As with the current RAJA reduction interface, the new interface supports *loc*
 reductions, which provide the ability to get a kernel/loop index at which the
 final reduction value was found. With this new interface, *loc* reductions
-are performed using ``ValLoc`` types. Since they are strongly typed, they
-provide ``min()`` and ``max()`` operations that are equivalent to using
-``RAJA_MIN()`` or ``RAJA_MAX`` macros as demonstrated in the code example below.
-Users must use the ``getVal()`` and ``getLoc()`` methods to access the reduction
-results::
+are performed using ``ValLoc<T, I>`` types, where ``T`` is the underlying data
+type and ``I`` is the index type. Users must use the ``getVal()`` and
+``getLoc()`` methods to access the reduction results after the kernel
+completes.
+
+In the lambda expression, a ``ValLoc<T, I>`` must be wrapped in a ``ValOp``
+type and passed to the lambda in the same order as the corresponding
+``RAJA::expt::Reduce`` arguments, e.g. ``ValOp<ValLoc<T, I>, Op>``. In the
+example below, ``VALOPLOC_DOUBLE_MIN`` represents a wrapped ``ValLoc`` usable
+within the lambda.
+
+For convenience, an alias, ``RAJA::expt::ValLocOp<T, I, Op>``, is provided.
+Within the lambda, this ``ValLocOp`` object provides ``minloc`` and ``maxloc``
+functions. In the example below, ``VALOPLOC_DOUBLE_MAX`` represents a wrapped
+``ValLoc`` using the ``ValLocOp`` alias::
 
    double* a = ...;
 
+   using VALOPLOC_DOUBLE_MIN = RAJA::expt::ValOp<RAJA::expt::ValLoc<double, RAJA::Index_type>,
+                                                 RAJA::operators::minimum>;
+   using VALOPLOC_DOUBLE_MAX = RAJA::expt::ValLocOp<double, RAJA::Index_type,
+                                                    RAJA::operators::maximum>;
+
    using VL_DOUBLE = RAJA::expt::ValLoc<double>;
-   VL_DOUBLE rm_loc;
+   VL_DOUBLE rmin_loc;
+   VL_DOUBLE rmax_loc;
 
    RAJA::forall<EXEC_POL> ( Res, Seg,
-     RAJA::expt::Reduce<RAJA::operators::minimum>(&rm_loc),
-     [=] (int i, VL_DOUBLE& _rm_loc) {
-       _rm_loc = RAJA_MIN(VL_DOUBLE(a[i], i), _rm_loc);
-       //_rm_loc.min(VL_DOUBLE(a[i], i)); // Alternative to RAJA_MIN
+     RAJA::expt::Reduce<RAJA::operators::minimum>(&rmin_loc),
+     RAJA::expt::Reduce<RAJA::operators::maximum>(&rmax_loc),
+     [=] (int i, VALOPLOC_DOUBLE_MIN& _rmin_loc, VALOPLOC_DOUBLE_MAX& _rmax_loc) {
+       _rmin_loc.minloc(a[i], i);
+       _rmax_loc.maxloc(a[i], i);
      }
   );
 
-   std::cout << rm_loc.getVal() ...
-   std::cout << rm_loc.getLoc() ...
+   std::cout << rmin_loc.getVal() ...
+   std::cout << rmin_loc.getLoc() ...
+   std::cout << rmax_loc.getVal() ...
+   std::cout << rmax_loc.getLoc() ...
+
+Alternatively, *loc* reductions can be performed on separate reduction data
+and location variables, without a ``ValLoc`` object, as shown in the next
+example. To use this capability, a ``RAJA::expt::ReduceLoc`` argument,
+templated on the reduction operation, must be passed to the ``RAJA::forall``
+along with pointers to the data and location variables. This is illustrated
+below, with the addresses of ``rm`` and ``loc`` being passed to the
+``ReduceLoc`` argument in the forall. The data and location can be accessed
+outside of the forall directly, without the ``getVal()`` or ``getLoc()``
+functions.
+::
+
+   double* a = ...;
+
+   using VALOPLOC_DOUBLE_MIN = RAJA::expt::ValLocOp<double, RAJA::Index_type,
+                                                    RAJA::operators::minimum>;
+
+   // No ValLoc needed from the user here.
+   double rm;
+   RAJA::Index_type loc;
+
+   RAJA::forall<EXEC_POL> ( Res, Seg,
+     RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&rm, &loc),  // --> 1 double & 1 index added
+     [=] (int i, VALOPLOC_DOUBLE_MIN& _rm_loc) {
+       _rm_loc.minloc(a[i], i);
+     }
+   );
+
+   // No getVal() or getLoc() required. Access results in their original form.
+   std::cout << rm ...
+   std::cout << loc ...
+
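+Note that ``VALOPLOC_DOUBLE_MIN`` above could equally be written with the
+``ValLocOp`` alias; the two spellings name the same type. A quick sketch,
+following the alias definition in ``params_base.hpp``::
+
+   using LocMinA = RAJA::expt::ValOp<RAJA::expt::ValLoc<double, RAJA::Index_type>,
+                                     RAJA::operators::minimum>;
+   using LocMinB = RAJA::expt::ValLocOp<double, RAJA::Index_type,
+                                        RAJA::operators::minimum>;
+
+   // Both name the same specialization of ValOp.
+   static_assert(std::is_same<LocMinA, LocMinB>::value,
+                 "ValLocOp<T, I, Op> is an alias for ValOp<ValLoc<T, I>, Op>");
+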
 Lambda Arguments
 ................
 
 This interface takes advantage of C++ parameter packs to allow users to pass
-any number of ``RAJA::expt::Reduce`` objects to the ``RAJA::forall`` method::
+any number of ``RAJA::expt::Reduce`` arguments to the ``RAJA::forall`` method::
 
    double* a = ...;
 
+   using VALOP_DOUBLE_SUM = RAJA::expt::ValOp<double, RAJA::operators::plus>;
+   using VALOP_DOUBLE_MIN = RAJA::expt::ValOp<double, RAJA::operators::minimum>;
+   using VALOPLOC_DOUBLE_MIN = RAJA::expt::ValLocOp<double, RAJA::Index_type,
+                                                    RAJA::operators::minimum>;
+
    using VL_DOUBLE = RAJA::expt::ValLoc<double>;
    VL_DOUBLE rm_loc;
    double rs;
@@ -287,10 +353,13 @@ any number of ``RAJA::expt::Reduce`` objects to the ``RAJA::forall`` method::
      RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),     // --> 1 double added
      RAJA::expt::Reduce<RAJA::operators::minimum>(&rm_loc), // --> 1 VL_DOUBLE added
      RAJA::expt::KernelName("MyFirstRAJAKernel"),           // --> NO args added
-     [=] (int i, double& _rs, double& _rm, VL_DOUBLE& _rm_loc) {
+     [=] (int i,
+          VALOP_DOUBLE_SUM& _rs,
+          VALOP_DOUBLE_MIN& _rm,
+          VALOPLOC_DOUBLE_MIN& _rm_loc) {
        _rs += a[i];
-       _rm = RAJA_MIN(a[i], _rm);
-       _rm_loc.min(VL_DOUBLE(a[i], i));
+       _rm.min(a[i]);
+       _rm_loc.minloc(a[i], i);
      }
   );
 
@@ -300,11 +369,12 @@ any number of ``RAJA::expt::Reduce`` objects to the ``RAJA::forall`` method::
    std::cout << rm_loc.getLoc() ...
 
 Again, the lambda expression parameters are in the same order as
-the ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. Both the types and
-order of the parameters must match to get correct results and to compile
-successfully. Otherwise, a static assertion will be triggered::
+the ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. The underlying data
+types and operators of the ``ValOp`` parameters, as well as their order, must
+match the corresponding ``RAJA::expt::Reduce`` types to get correct results
+and to compile successfully. Otherwise, a static assertion will be triggered::
 
-  LAMBDA Not invocable w/ EXPECTED_ARGS.
+  LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments.
 
 .. note:: This static assert is only enabled when passing an undecorated C++
           lambda. Meaning, this check will not happen when passing
@@ -329,6 +399,9 @@ The usage of the experimental reductions is similar to the forall example as illustrated
 
    double* a = ...;
 
+   using VALOP_DOUBLE_SUM = RAJA::expt::ValOp<double, RAJA::operators::plus>;
+   using VALOP_DOUBLE_MIN = RAJA::expt::ValOp<double, RAJA::operators::minimum>;
+
    double rs = 0.0;
    double rm = 1e100;
 
@@ -336,12 +409,12 @@ The usage of the experimental reductions is similar to the forall example as illustrated
      RAJA::expt::Reduce<RAJA::operators::plus>(&rs),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),
      "LaunchReductionKernel",
-    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, double& _rs, double& _rm) {
+    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_DOUBLE_SUM& _rs, VALOP_DOUBLE_MIN& _rm) {
 
       RAJA::loop<LOOP_POL>(ctx, Seg, [&] (int i) {
 
           _rs += a[i];
-          _rm = RAJA_MIN(a[i], _rm);
+          _rm.min(a[i]);
        }
      );
 
diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp
index fb82582704..2305f74c2b 100644
--- a/examples/forall-param-reductions.cpp
+++ b/examples/forall-param-reductions.cpp
@@ -66,6 +66,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
     }
   }
 
+//
+// Set a[0] to a different value. Total sum should be 2.
+//
+  a[0] = 3;
+
 //
 // Set min and max loc values
 //
@@ -80,7 +85,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
 // Note: with this data initialization scheme, the following results will
 // be observed for all reduction kernels below:
 //
-//  - the sum will be zero
+//  - the sum will be two
 //  - the min will be -100
 //  - the max will be 100
 //  - the min loc will be N/2
@@ -99,7 +104,18 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
 // Define ValLoc Type
 //
 
-  using VALLOC_INT = RAJA::expt::ValLoc<int>;
+  using VALLOC_INT = RAJA::expt::ValLoc<int, RAJA::Index_type>;
+
+//
+// Define ValOp Types
+//
+
+  using VALOP_INT_SUM = RAJA::expt::ValOp<int, RAJA::operators::plus>;
+  using VALOP_INT_MIN = RAJA::expt::ValOp<int, RAJA::operators::minimum>;
+  using VALOP_INT_MAX = RAJA::expt::ValOp<int, RAJA::operators::maximum>;
+  using VALOPLOC_INT_MIN = RAJA::expt::ValLocOp<int, RAJA::Index_type, RAJA::operators::minimum>;
+  using VALOPLOC_INT_MAX = RAJA::expt::ValLocOp<int, RAJA::Index_type, RAJA::operators::maximum>;
+
 
 //----------------------------------------------------------------------------//
 
   std::cout << "\n Running RAJA sequential reductions...\n";
@@ -113,26 +129,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT seq_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT seq_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int seq_min2 = std::numeric_limits<int>::max();
+  int seq_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type seq_minloc2(-1);
+  RAJA::Index_type seq_maxloc2(-1);
+
   RAJA::forall(host_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&seq_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&seq_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&seq_max),
    RAJA::expt::Reduce<RAJA::operators::minimum>(&seq_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&seq_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&seq_min2, &seq_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&seq_max2, &seq_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce Seq Kernel"),
-    [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) {
+    [=](int i,
+        VALOP_INT_SUM &_seq_sum,
+        VALOP_INT_MIN &_seq_min,
+        VALOP_INT_MAX &_seq_max,
+        VALOPLOC_INT_MIN &_seq_minloc,
+        VALOPLOC_INT_MAX &_seq_maxloc,
+        VALOPLOC_INT_MIN &_seq_minloc2,
+        VALOPLOC_INT_MAX &_seq_maxloc2) {
       _seq_sum += a[i];
-      _seq_min = RAJA_MIN(a[i], _seq_min);
-      _seq_max = RAJA_MAX(a[i], _seq_max);
+      _seq_min.min(a[i]);
+      _seq_max.max(a[i]);
+
+      _seq_minloc.minloc(a[i], i);
+      _seq_maxloc.maxloc(a[i], i);
 
-      _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc);
-      _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc);
-      //_seq_minloc.min(a[i], i);
-      //_seq_maxloc.max(a[i], i);
-      // Note : RAJA::expt::ValLoc objects provide min() and max() methods
-      // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX
-      // above.
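+      // ReduceLoc targets below reduce into a separate value variable and a
+      // separate index variable; no ValLoc object is needed.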
+      _seq_minloc2.minloc(a[i], i);
+      _seq_maxloc2.maxloc(a[i], i);
     }
   );
 
@@ -143,6 +171,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << seq_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , "
                                     << seq_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << seq_min2 << " , "
+                                    << seq_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << seq_max2 << " , "
+                                    << seq_maxloc2 << std::endl;
 
 // _reductions_raja_seq_end
 
@@ -161,23 +193,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT omp_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT omp_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int omp_min2 = std::numeric_limits<int>::max();
+  int omp_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type omp_minloc2(-1);
+  RAJA::Index_type omp_maxloc2(-1);
+
   RAJA::forall(host_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&omp_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&omp_min2, &omp_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&omp_max2, &omp_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"),
-    [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) {
+    [=](int i,
+        VALOP_INT_SUM &_omp_sum,
+        VALOP_INT_MIN &_omp_min,
+        VALOP_INT_MAX &_omp_max,
+        VALOPLOC_INT_MIN &_omp_minloc,
+        VALOPLOC_INT_MAX &_omp_maxloc,
+        VALOPLOC_INT_MIN &_omp_minloc2,
+        VALOPLOC_INT_MAX &_omp_maxloc2) {
      _omp_sum += a[i];
-      _omp_min = RAJA_MIN(a[i], _omp_min);
-      _omp_max = RAJA_MAX(a[i], _omp_max);
+      _omp_min.min(a[i]);
+      _omp_max.max(a[i]);
+
+      _omp_minloc.minloc(a[i], i);
+      _omp_maxloc.maxloc(a[i], i);
 
-      _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc);
-      _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc);
-      //_omp_minloc.min(a[i], i);
-      //_omp_maxloc.max(a[i], i);
+      _omp_minloc2.minloc(a[i], i);
+      _omp_maxloc2.maxloc(a[i], i);
     }
   );
 
@@ -188,6 +235,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << omp_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , "
                                     << omp_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << omp_min2 << " , "
+                                    << omp_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << omp_max2 << " , "
+                                    << omp_maxloc2 << std::endl;
 
 #endif
 
@@ -208,23 +259,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT omp_t_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT omp_t_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int omp_t_min2 = std::numeric_limits<int>::max();
+  int omp_t_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type omp_t_minloc2(-1);
+  RAJA::Index_type omp_t_maxloc2(-1);
+
   RAJA::forall(omp_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&omp_t_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_t_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_t_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_t_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_t_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&omp_t_min2, &omp_t_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&omp_t_max2, &omp_t_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"),
-    [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) {
+    [=](int i,
+        VALOP_INT_SUM &_omp_t_sum,
+        VALOP_INT_MIN &_omp_t_min,
+        VALOP_INT_MAX &_omp_t_max,
+        VALOPLOC_INT_MIN &_omp_t_minloc,
+        VALOPLOC_INT_MAX &_omp_t_maxloc,
+        VALOPLOC_INT_MIN &_omp_t_minloc2,
+        VALOPLOC_INT_MAX &_omp_t_maxloc2) {
      _omp_t_sum += a[i];
-      _omp_t_min = RAJA_MIN(a[i], _omp_t_min);
-      _omp_t_max = RAJA_MAX(a[i], _omp_t_max);
+      _omp_t_min.min(a[i]);
+      _omp_t_max.max(a[i]);
 
-      _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc);
-      _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc);
-      //_omp_t_minloc.min(a[i], i);
-      //_omp_t_maxloc.max(a[i], i);
+      _omp_t_minloc.minloc(a[i], i);
+      _omp_t_maxloc.maxloc(a[i], i);
+
+      _omp_t_minloc2.minloc(a[i], i);
+      _omp_t_maxloc2.maxloc(a[i], i);
     }
   );
 
@@ -235,6 +301,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << omp_t_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , "
                                     << omp_t_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << omp_t_min2 << " , "
+                                    << omp_t_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << omp_t_max2 << " , "
+                                    << omp_t_maxloc2 << std::endl;
 
 #endif
 
@@ -259,23 +329,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT cuda_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT cuda_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int cuda_min2 = std::numeric_limits<int>::max();
+  int cuda_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type cuda_minloc2(-1);
+  RAJA::Index_type cuda_maxloc2(-1);
+
   RAJA::forall(cuda_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&cuda_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&cuda_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&cuda_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&cuda_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&cuda_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&cuda_min2, &cuda_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&cuda_max2, &cuda_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"),
-    [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) {
+    [=] RAJA_DEVICE ( int i,
+                      VALOP_INT_SUM &_cuda_sum,
+                      VALOP_INT_MIN &_cuda_min,
+                      VALOP_INT_MAX &_cuda_max,
+                      VALOPLOC_INT_MIN &_cuda_minloc,
+                      VALOPLOC_INT_MAX &_cuda_maxloc,
+                      VALOPLOC_INT_MIN &_cuda_minloc2,
+                      VALOPLOC_INT_MAX &_cuda_maxloc2) {
      _cuda_sum += d_a[i];
-      _cuda_min = RAJA_MIN(d_a[i], _cuda_min);
-      _cuda_max = RAJA_MAX(d_a[i], _cuda_max);
+      _cuda_min.min(d_a[i]);
+      _cuda_max.max(d_a[i]);
 
-      _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc);
-      _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc);
-      //_cuda_minloc.min(a[i], i);
-      //_cuda_maxloc.max(a[i], i);
+      _cuda_minloc.minloc(d_a[i], i);
+      _cuda_maxloc.maxloc(d_a[i], i);
+
+      _cuda_minloc2.minloc(d_a[i], i);
+      _cuda_maxloc2.maxloc(d_a[i], i);
     }
   );
 
@@ -286,6 +371,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << cuda_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , "
                                     << cuda_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << cuda_min2 << " , "
+                                    << cuda_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << cuda_max2 << " , "
+                                    << cuda_maxloc2 << std::endl;
 
   cuda_res.deallocate(d_a);
 
 #endif
 
@@ -309,23 +398,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT hip_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT hip_maxloc(std::numeric_limits<int>::min(), -1);
 
-  RAJA::forall(arange,
+  int hip_min2 = std::numeric_limits<int>::max();
+  int hip_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type hip_minloc2(-1);
+  RAJA::Index_type hip_maxloc2(-1);
+
+  RAJA::forall(hip_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&hip_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&hip_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&hip_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&hip_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&hip_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&hip_min2, &hip_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&hip_max2, &hip_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce HIP Kernel"),
-    [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) {
+    [=] RAJA_DEVICE ( int i,
+                      VALOP_INT_SUM &_hip_sum,
+                      VALOP_INT_MIN &_hip_min,
+                      VALOP_INT_MAX &_hip_max,
+                      VALOPLOC_INT_MIN &_hip_minloc,
+                      VALOPLOC_INT_MAX &_hip_maxloc,
+                      VALOPLOC_INT_MIN &_hip_minloc2,
+                      VALOPLOC_INT_MAX &_hip_maxloc2) {
      _hip_sum += d_a[i];
-      _hip_min = RAJA_MIN(d_a[i], _hip_min);
-      _hip_max = RAJA_MAX(d_a[i], _hip_max);
+      _hip_min.min(d_a[i]);
+      _hip_max.max(d_a[i]);
+
+      _hip_minloc.minloc(d_a[i], i);
+      _hip_maxloc.maxloc(d_a[i], i);
 
-      _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc);
-      _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc);
-      //_hip_minloc.min(d_a[i], i);
-      //_hip_maxloc.max(d_a[i], i);
+      _hip_minloc2.minloc(d_a[i], i);
+      _hip_maxloc2.maxloc(d_a[i], i);
     }
  );
 
@@ -336,6 +440,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << hip_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , "
                                     << hip_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << hip_min2 << " , "
+                                    << hip_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << hip_max2 << " , "
+                                    << hip_maxloc2 << std::endl;
 
   hip_res.deallocate(d_a);
 
 #endif
 
@@ -360,23 +468,38 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT sycl_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT sycl_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int sycl_min2 = std::numeric_limits<int>::max();
+  int sycl_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type sycl_minloc2(-1);
+  RAJA::Index_type sycl_maxloc2(-1);
+
   RAJA::forall(sycl_res, arange,
     RAJA::expt::Reduce<RAJA::operators::plus>(&sycl_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&sycl_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&sycl_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&sycl_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&sycl_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&sycl_min2, &sycl_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&sycl_max2, &sycl_maxloc2),
     RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"),
-    [=] RAJA_DEVICE (int i, int &_sycl_sum, int &_sycl_min, int &_sycl_max, VALLOC_INT &_sycl_minloc, VALLOC_INT &_sycl_maxloc) {
+    [=] RAJA_DEVICE ( int i,
+                      VALOP_INT_SUM &_sycl_sum,
+                      VALOP_INT_MIN &_sycl_min,
+                      VALOP_INT_MAX &_sycl_max,
+                      VALOPLOC_INT_MIN &_sycl_minloc,
+                      VALOPLOC_INT_MAX &_sycl_maxloc,
+                      VALOPLOC_INT_MIN &_sycl_minloc2,
+                      VALOPLOC_INT_MAX &_sycl_maxloc2) {
      _sycl_sum += d_a[i];
-      _sycl_min = RAJA_MIN(d_a[i], _sycl_min);
-      _sycl_max = RAJA_MAX(d_a[i], _sycl_max);
+      _sycl_min.min(d_a[i]);
+      _sycl_max.max(d_a[i]);
+
+      _sycl_minloc.minloc(d_a[i], i);
+      _sycl_maxloc.maxloc(d_a[i], i);
 
-      _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc);
-      _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc);
-      //_sycl_minloc.min(d_a[i], i);
-      //_sycl_maxloc.max(d_a[i], i);
+      _sycl_minloc2.minloc(d_a[i], i);
+      _sycl_maxloc2.maxloc(d_a[i], i);
     }
  );
 
@@ -387,6 +510,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
                                     << sycl_minloc.getLoc() << std::endl;
   std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , "
                                     << sycl_maxloc.getLoc() << std::endl;
+  std::cout << "\tmin2, loc2 = " << sycl_min2 << " , "
+                                    << sycl_minloc2 << std::endl;
+  std::cout << "\tmax2, loc2 = " << sycl_max2 << " , "
+                                    << sycl_maxloc2 << std::endl;
 
   sycl_res.deallocate(d_a);
 
 #endif
 
diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp
index b57bedfd6b..5bec907c33 100644
--- a/examples/launch-param-reductions.cpp
+++ b/examples/launch-param-reductions.cpp
@@ -81,6 +81,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
     }
   }
 
+//
+// Set a[0] to a different value. Total sum should be 2.
+//
+  a[0] = 3;
+
 //
 // Set min and max loc values
 //
@@ -95,7 +100,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
 // Note: with this data initialization scheme, the following results will
 // be observed for all reduction kernels below:
 //
-//  - the sum will be zero
+//  - the sum will be two
 //  - the min will be -100
 //  - the max will be 100
 //  - the min loc will be N/2
@@ -115,6 +120,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
 //
   using VALLOC_INT = RAJA::expt::ValLoc<int>;
 
+//
+// Define ValOp Types
+//
+
+  using VALOP_INT_SUM = RAJA::expt::ValOp<int, RAJA::operators::plus>;
+  using VALOP_INT_MIN = RAJA::expt::ValOp<int, RAJA::operators::minimum>;
+  using VALOP_INT_MAX = RAJA::expt::ValOp<int, RAJA::operators::maximum>;
+  using VALOPLOC_INT_MIN = RAJA::expt::ValLocOp<int, RAJA::Index_type, RAJA::operators::minimum>;
+  using VALOPLOC_INT_MAX = RAJA::expt::ValLocOp<int, RAJA::Index_type, RAJA::operators::maximum>;
+
 //----------------------------------------------------------------------------//
 
   std::cout << "\n Running RAJA sequential reductions...\n";
@@ -129,32 +145,41 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT seq_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT seq_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int seq_min2 = std::numeric_limits<int>::max();
+  int seq_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type seq_minloc2(-1);
+  RAJA::Index_type seq_maxloc2(-1);
+
   RAJA::launch
     (host_res, RAJA::LaunchParams(), "SeqReductionKernel",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&seq_sum),
+    RAJA::expt::Reduce<RAJA::operators::plus>(&seq_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&seq_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&seq_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&seq_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&seq_maxloc),
-    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx,
-                          int &_seq_sum, int &_seq_min,
-                          int &_seq_max, VALLOC_INT &_seq_minloc,
-                          VALLOC_INT &_seq_maxloc) {
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&seq_min2, &seq_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&seq_max2, &seq_maxloc2),
+    [=] RAJA_HOST_DEVICE ( RAJA::LaunchContext ctx,
+                           VALOP_INT_SUM &_seq_sum,
+                           VALOP_INT_MIN &_seq_min,
+                           VALOP_INT_MAX &_seq_max,
+                           VALOPLOC_INT_MIN &_seq_minloc,
+                           VALOPLOC_INT_MAX &_seq_maxloc,
+                           VALOPLOC_INT_MIN &_seq_minloc2,
+                           VALOPLOC_INT_MAX &_seq_maxloc2) {
 
       RAJA::loop(ctx, arange, [&] (int i) {
 
           _seq_sum += a[i];
-          _seq_min = RAJA_MIN(a[i], _seq_min);
-          _seq_max = RAJA_MAX(a[i], _seq_max);
+          _seq_min.min(a[i]);
+          _seq_max.max(a[i]);
+
+          _seq_minloc.minloc(a[i], i);
+          _seq_maxloc.maxloc(a[i], i);
 
-          _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc);
-          _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc);
-          //_seq_minloc.min(a[i], i);
-          //_seq_maxloc.max(a[i], i);
-          // Note : RAJA::expt::ValLoc objects provide min() and max() methods
-          // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX
-          // above.
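+          // ReduceLoc targets: reduce into separate value and index
+          // variables; no ValLoc object is needed.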
+          _seq_minloc2.minloc(a[i], i);
+          _seq_maxloc2.maxloc(a[i], i);
        }
      );
 
@@ -187,29 +212,41 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT omp_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT omp_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int omp_min2 = std::numeric_limits<int>::max();
+  int omp_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type omp_minloc2(-1);
+  RAJA::Index_type omp_maxloc2(-1);
+
   RAJA::launch
     (host_res, RAJA::LaunchParams(), "OmpReductionKernel",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&omp_sum),
+    RAJA::expt::Reduce<RAJA::operators::plus>(&omp_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&omp_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&omp_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&omp_min2, &omp_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&omp_max2, &omp_maxloc2),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx,
-                          int &_omp_sum, int &_omp_min,
-                          int &_omp_max, VALLOC_INT &_omp_minloc,
-                          VALLOC_INT &_omp_maxloc) {
+                          VALOP_INT_SUM &_omp_sum,
+                          VALOP_INT_MIN &_omp_min,
+                          VALOP_INT_MAX &_omp_max,
+                          VALOPLOC_INT_MIN &_omp_minloc,
+                          VALOPLOC_INT_MAX &_omp_maxloc,
+                          VALOPLOC_INT_MIN &_omp_minloc2,
+                          VALOPLOC_INT_MAX &_omp_maxloc2) {
 
       RAJA::loop(ctx, arange, [&] (int i) {
 
          _omp_sum += a[i];
-          _omp_min = RAJA_MIN(a[i], _omp_min);
-          _omp_max = RAJA_MAX(a[i], _omp_max);
+          _omp_min.min(a[i]);
+          _omp_max.max(a[i]);
+
+          _omp_minloc.minloc(a[i], i);
+          _omp_maxloc.maxloc(a[i], i);
 
-          _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc);
-          _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc);
-          //_omp_minloc.min(a[i], i);
-          //_omp_maxloc.max(a[i], i);
+          _omp_minloc2.minloc(a[i], i);
+          _omp_maxloc2.maxloc(a[i], i);
        }
      );
 
@@ -247,30 +284,43 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT cuda_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT cuda_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int cuda_min2 = std::numeric_limits<int>::max();
+  int cuda_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type cuda_minloc2(-1);
+  RAJA::Index_type cuda_maxloc2(-1);
+
   RAJA::launch
     (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)),
      "CUDAReductionKernel",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&cuda_sum),
+    RAJA::expt::Reduce<RAJA::operators::plus>(&cuda_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&cuda_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&cuda_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&cuda_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&cuda_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&cuda_min2, &cuda_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&cuda_max2, &cuda_maxloc2),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx,
-                          int &_cuda_sum, int &_cuda_min, int &_cuda_max,
-                          VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) {
+                          VALOP_INT_SUM &_cuda_sum,
+                          VALOP_INT_MIN &_cuda_min,
+                          VALOP_INT_MAX &_cuda_max,
+                          VALOPLOC_INT_MIN &_cuda_minloc,
+                          VALOPLOC_INT_MAX &_cuda_maxloc,
+                          VALOPLOC_INT_MIN &_cuda_minloc2,
+                          VALOPLOC_INT_MAX &_cuda_maxloc2) {
 
      RAJA::loop(ctx, arange, [&] (int i) {
 
          _cuda_sum += d_a[i];
-          _cuda_min = RAJA_MIN(d_a[i], _cuda_min);
-          _cuda_max = RAJA_MAX(d_a[i], _cuda_max);
+          _cuda_min.min(d_a[i]);
+          _cuda_max.max(d_a[i]);
+
+          _cuda_minloc.minloc(d_a[i], i);
+          _cuda_maxloc.maxloc(d_a[i], i);
 
-          _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc);
-          _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc);
-          //_cuda_minloc.min(a[i], i);
-          //_cuda_maxloc.max(a[i], i);
+          _cuda_minloc2.minloc(d_a[i], i);
+          _cuda_maxloc2.maxloc(d_a[i], i);
        }
      );
 
@@ -311,31 +361,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT hip_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT hip_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int hip_min2 = std::numeric_limits<int>::max();
+  int hip_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type hip_minloc2(-1);
+  RAJA::Index_type hip_maxloc2(-1);
+
   RAJA::launch
     (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)),
      "HipReductionKernel",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&hip_sum),
+    RAJA::expt::Reduce<RAJA::operators::plus>(&hip_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&hip_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&hip_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&hip_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&hip_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&hip_min2, &hip_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&hip_max2, &hip_maxloc2),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx,
-                          int &_hip_sum, int &_hip_min,
-                          int &_hip_max, VALLOC_INT &_hip_minloc,
-                          VALLOC_INT &_hip_maxloc) {
+                          VALOP_INT_SUM &_hip_sum,
+                          VALOP_INT_MIN &_hip_min,
+                          VALOP_INT_MAX &_hip_max,
+                          VALOPLOC_INT_MIN &_hip_minloc,
+                          VALOPLOC_INT_MAX &_hip_maxloc,
+                          VALOPLOC_INT_MIN &_hip_minloc2,
+                          VALOPLOC_INT_MAX &_hip_maxloc2) {
 
      RAJA::loop(ctx, arange, [&] (int i) {
 
          _hip_sum += d_a[i];
-          _hip_min = RAJA_MIN(d_a[i], _hip_min);
-          _hip_max = RAJA_MAX(d_a[i], _hip_max);
+          _hip_min.min(d_a[i]);
+          _hip_max.max(d_a[i]);
 
-          _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc);
-          _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc);
-          //_hip_minloc.min(d_a[i], i);
-          //_hip_maxloc.max(d_a[i], i);
+          _hip_minloc.minloc(d_a[i], i);
+          _hip_maxloc.maxloc(d_a[i], i);
+          _hip_minloc2.minloc(d_a[i], i);
+          _hip_maxloc2.maxloc(d_a[i], i);
        }
      );
 
@@ -374,31 +435,42 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
   VALLOC_INT sycl_minloc(std::numeric_limits<int>::max(), -1);
   VALLOC_INT sycl_maxloc(std::numeric_limits<int>::min(), -1);
 
+  int sycl_min2 = std::numeric_limits<int>::max();
+  int sycl_max2 = std::numeric_limits<int>::min();
+  RAJA::Index_type sycl_minloc2(-1);
+  RAJA::Index_type sycl_maxloc2(-1);
+
   RAJA::launch
     (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)),
      "SyclReductionKernel",
-     RAJA::expt::Reduce<RAJA::operators::plus>(&sycl_sum),
+    RAJA::expt::Reduce<RAJA::operators::plus>(&sycl_sum),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&sycl_min),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&sycl_max),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&sycl_minloc),
     RAJA::expt::Reduce<RAJA::operators::maximum>(&sycl_maxloc),
+    RAJA::expt::ReduceLoc<RAJA::operators::minimum>(&sycl_min2, &sycl_minloc2),
+    RAJA::expt::ReduceLoc<RAJA::operators::maximum>(&sycl_max2, &sycl_maxloc2),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx,
-                          int &_sycl_sum, int &_sycl_min,
-                          int &_sycl_max, VALLOC_INT &_sycl_minloc,
-                          VALLOC_INT &_sycl_maxloc) {
+                          VALOP_INT_SUM &_sycl_sum,
+                          VALOP_INT_MIN &_sycl_min,
+                          VALOP_INT_MAX &_sycl_max,
+                          VALOPLOC_INT_MIN &_sycl_minloc,
+                          VALOPLOC_INT_MAX &_sycl_maxloc,
+                          VALOPLOC_INT_MIN &_sycl_minloc2,
+                          VALOPLOC_INT_MAX &_sycl_maxloc2) {
 
      RAJA::loop(ctx, arange, [&] (int i) {
 
          _sycl_sum += d_a[i];
-          _sycl_min = RAJA_MIN(d_a[i], _sycl_min);
-          _sycl_max = RAJA_MAX(d_a[i], _sycl_max);
+          _sycl_min.min(d_a[i]);
+          _sycl_max.max(d_a[i]);
 
-          _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc);
-          _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc);
-          //_sycl_minloc.min(d_a[i], i);
-          //_sycl_maxloc.max(d_a[i], i);
+          _sycl_minloc.minloc(d_a[i], i);
+          _sycl_maxloc.maxloc(d_a[i], i);
+          _sycl_minloc2.minloc(d_a[i], i);
+          _sycl_maxloc2.maxloc(d_a[i], i);
        }
      );
 
diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp
index fb854c8706..5a656206f5 100644
--- a/include/RAJA/pattern/params/forall.hpp
+++ b/include/RAJA/pattern/params/forall.hpp
@@ -276,7 +276,7 @@ namespace expt
 template <typename LAMBDA, typename... EXPECTED_ARGS>
 constexpr concepts::enable_if<type_traits::is_invocable<LAMBDA, EXPECTED_ARGS...>>
 check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&)
 {
 #if !defined(RAJA_ENABLE_HIP)
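+  // The ValOp reference parameters of the user lambda must match, in order
+  // and in type, the ValOps generated by the Reduce/ReduceLoc arguments;
+  // otherwise the check below fails.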
-  static_assert(is_invocable<typename std::remove_reference<LAMBDA>::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS.");
+  static_assert(is_invocable<typename std::remove_reference<LAMBDA>::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments.");
 #endif
 }
 
diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp
index 51e96260f8..98380f6ffc 100644
--- a/include/RAJA/pattern/params/params_base.hpp
+++ b/include/RAJA/pattern/params/params_base.hpp
@@ -6,6 +6,112 @@ namespace RAJA
 {
 namespace expt
 {
+
+  template <typename T, typename IndexType = RAJA::Index_type>
+  struct ValLoc {
+    using index_type = IndexType;
+    using value_type = T;
+
+    ValLoc() = default;
+    RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
+    RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l) {}
+
+    ValLoc(ValLoc const &) = default;
+    ValLoc(ValLoc &&) = default;
+    ValLoc& operator=(ValLoc const &) = default;
+    ValLoc& operator=(ValLoc &&) = default;
+
+    RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const { return val < rhs.val; }
+    RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const { return val > rhs.val; }
+
+    RAJA_HOST_DEVICE constexpr const value_type& getVal() const {return val;}
+    RAJA_HOST_DEVICE constexpr const index_type& getLoc() const {return loc;}
+
+    RAJA_HOST_DEVICE void set(T inval, IndexType inindex) {val = inval; loc = inindex;}
+    RAJA_HOST_DEVICE void setVal(T inval) {val = inval;}
+    RAJA_HOST_DEVICE void setLoc(IndexType inindex) {loc = inindex;}
+
+    value_type val;
+    index_type loc = -1;
+  };
+
+  template <typename T, template <typename, typename, typename> class Op>
+  struct ValOp {
+    using value_type = T;
+    using op_type = Op<T, T, T>;
+
+    ValOp() = default;
+    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+
+    ValOp(ValOp const &) = default;
+    ValOp(ValOp &&) = default;
+    ValOp& operator=(ValOp const &) = default;
+    ValOp& operator=(ValOp &&) = default;
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::plus<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & operator+=(const value_type& rhs) { val += rhs; return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & operator&=(const value_type& rhs) { val &= rhs; return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & operator|=(const value_type& rhs) { val |= rhs; return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE ValOp & operator&=(value_type& rhs) { val &= rhs; return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value> * = nullptr>
+    RAJA_HOST_DEVICE ValOp & operator|=(value_type& rhs) { val |= rhs; return *this; }
+
+    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { return val < rhs.val; }
+    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { return val > rhs.val; }
+
+    value_type val = op_type::identity();
+  };
+
+  template <typename T, typename IndexType, template <typename, typename, typename> class Op>
+  struct ValOp <ValLoc<T, IndexType>, Op> {
+    using index_type = IndexType;
+    using value_type = ValLoc<T, IndexType>;
+    using op_type = Op<value_type, value_type, value_type>;
+    using valloc_value_type = typename value_type::value_type;
+    using valloc_index_type = typename value_type::index_type;
+
+    ValOp() = default;
+    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+    RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l) : val(v, l) {}
+
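+    // Specialization of ValOp for ValLoc values: min()/max() compare whole
+    // (value, loc) pairs, while minloc()/maxloc() below build the pair from
+    // a separate value and index.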
+    ValOp(ValOp const &) = default;
+    ValOp(ValOp &&) = default;
+    ValOp& operator=(ValOp const &) = default;
+    ValOp& operator=(ValOp &&) = default;
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type, value_type, value_type>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type, value_type, value_type>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type, value_type, value_type>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & minloc(valloc_value_type v, valloc_index_type l) { return min(value_type(v, l)); }
+
+    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type, value_type, value_type>>::value> * = nullptr>
+    RAJA_HOST_DEVICE constexpr ValOp & maxloc(valloc_value_type v, valloc_index_type l) { return max(value_type(v, l)); }
+
+    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { return val < rhs.val; }
+    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { return val > rhs.val; }
+
+    value_type val = op_type::identity();
+  };
+
+  template <typename T, typename IndexType, template <typename, typename, typename> class Op>
+  using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
+
 namespace detail
 {
 
diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 05103c7ad4..78b6d7714d 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -15,46 +15,18 @@
 namespace RAJA
 {
 
-namespace expt
-{
-
-template<typename T>
-struct ValLoc {
-  using index_type = RAJA::Index_type;
-  using value_type = T;
-
-  RAJA_HOST_DEVICE ValLoc() {}
-  RAJA_HOST_DEVICE ValLoc(value_type v) : val(v) {}
-  RAJA_HOST_DEVICE ValLoc(value_type v, RAJA::Index_type l) : val(v), loc(l) {}
-
-  RAJA_HOST_DEVICE void min(value_type v, index_type l) { if (v < val) { val = v; loc = l; } }
-  RAJA_HOST_DEVICE void max(value_type v, index_type l) { if (v > val) { val = v; loc = l; } }
-
-  bool constexpr operator<(const ValLoc& rhs) const { return val < rhs.val; }
-  bool constexpr operator>(const ValLoc& rhs) const { return val > rhs.val; }
-
-  value_type getVal() {return val;}
-  RAJA::Index_type getLoc() {return loc;}
-
-private:
-  value_type val;
-  index_type loc = -1;
-};
-
-} // namespace expt
-
 namespace operators
 {
 
-template <typename T>
-struct limits<RAJA::expt::ValLoc<T>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T> min()
+template <typename T, typename IndexType>
+struct limits<RAJA::expt::ValLoc<T, IndexType>> {
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> min()
   {
-    return RAJA::expt::ValLoc<T>(RAJA::operators::limits<T>::min());
+    return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T> max()
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> max()
   {
-    return RAJA::expt::ValLoc<T>(RAJA::operators::limits<T>::max());
+    return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::max());
   }
 };
 
@@ -83,16 +55,41 @@ namespace detail
   //
   //   Basic Reducer
   //
   //
-  template <typename Op, typename T>
+
+  // Basic data type Reducer
+  // T must be a basic data type
+  // VOp must be ValOp<T, Op>
+  template <typename Op, typename T, typename VOp>
   struct Reducer : public ForallParamBase {
     using op = Op;
-    using value_type = T;
+    using value_type = T;  // This is a basic data type
 
-    RAJA_HOST_DEVICE Reducer() {}
-    Reducer(value_type *target_in) : target(target_in), val(op::identity()) {}
+    Reducer() = default;
+
+    // Basic data type constructor
+    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target(target_in) {}
+
+    Reducer(Reducer const &) = default;
+    Reducer(Reducer &&) = default;
+    Reducer& operator=(Reducer const &) = default;
+    Reducer& operator=(Reducer &&) = default;
+
+    // Internal ValOp object that is used within RAJA::forall/launch
+    VOp m_valop = VOp{};
+
+    // Points to the user specified result variable
     value_type *target = nullptr;
-    value_type val = op::identity();
+
+    // combineTarget() performs the final op on the target data and location in resolve()
+    RAJA_HOST_DEVICE void combineTarget(value_type in)
+    {
+      value_type temp = op{}(*target, in);
+      *target = temp;
+    }
+
+    RAJA_HOST_DEVICE
+    value_type &
+    getVal() { return m_valop.val; }
 
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
     // Device related attributes.
@@ -101,48 +98,101 @@
     unsigned int * device_count = nullptr;
 #endif
 
-    using ARG_TUP_T = camp::tuple<value_type*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); }
+    // These are types and parameters extracted from this struct, and given to the forall.
+    using ARG_TUP_T = camp::tuple<VOp*>;
+    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
 
     using ARG_LIST_T = typename ARG_TUP_T::TList;
     static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
   };
 
-} // namespace detail
-
-template