Merge pull request #222 from LLNL/bugfix/probinso/advector_factory
Adds 2D JAGGED Loop type
adayton1 authored Apr 19, 2023
2 parents 4cb7037 + e79ebe4 commit 6f28ad2
Showing 9 changed files with 222 additions and 20 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -25,6 +25,7 @@ option(ENABLE_HIP "Build HIP support" OFF)
option(ENABLE_PICK "Enable pick and set methods on ManagedArrays" ON)
option(ENABLE_PINNED "Enable pinned memory space" ON)
option(CARE_ENABLE_PINNED_MEMORY_FOR_SCANS "Use pinned memory for scan lengths" ON)
option(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU "Allows default memory spaces for ZERO_COPY and PAGEABLE to be the GPU memory space" OFF)
# Option to disable implicit conversion between host_device_ptr and raw arrays in CARE.
option(CARE_ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw pointers" ON)
# CHAI must also be configured with the same settings for implicit conversions.
1 change: 1 addition & 0 deletions src/care/CMakeLists.txt
@@ -63,6 +63,7 @@ set(care_headers
set(care_sources
care.cpp
CHAICallback.cpp
ExecutionSpace.cpp
LoopFuser.cpp
RAJAPlugin.cpp
scan.cpp
11 changes: 11 additions & 0 deletions src/care/DefaultMacros.h
@@ -876,5 +876,16 @@

#define CARE_MANAGED_PTR_UPDATE_KERNEL_END CARE_CHECKED_MANAGED_PTR_UPDATE_KERNEL_END(care_managed_ptr_write_kernel_check) }

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros for launching a 2D kernel with fixed y dimension and varying x dimension
/// If GPU is available, executes on the device.
///
////////////////////////////////////////////////////////////////////////////////
#define CARE_LOOP_2D_STREAM_JAGGED(XINDEX, XSTART, XEND, XLENGTHS, YINDEX, YSTART, YLENGTH, FLAT_INDEX) \
launch_2D_jagged(care::gpu{}, XSTART, XEND, XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__, __LINE__, [=] CARE_DEVICE (int XINDEX, int YINDEX)->void {
#define CARE_LOOP_2D_STREAM_JAGGED_END });


#endif // !defined(_CARE_DEFAULT_MACROS_H_)
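For reference, a minimal usage sketch of the new jagged-loop macro. All names below (rows, maxLen, lengths, data) are illustrative and not part of this commit; the pattern mirrors the SortFuser change further down.

   const int rows = 4;
   const int maxLen = 100;
   // per-row x extents, kept in ZERO_COPY memory so both host and device can read them
   care::host_device_ptr<int> lengths(chai::ManagedArray<int>(rows, chai::ZERO_COPY));
   care::host_ptr<int> host_lengths = lengths.getPointer(care::ZERO_COPY, false);
   for (int row = 0; row < rows; ++row) {
      host_lengths[row] = (row + 1) * maxLen / rows;   // jagged: each row gets a different extent
   }
   care::host_device_ptr<double> data(rows * maxLen, "data");

   // runs the body for every (x, row) with 0 <= x < lengths[row] and 0 <= row < rows
   CARE_LOOP_2D_STREAM_JAGGED(x, 0, maxLen, lengths, row, 0, rows, flatIdx) {
      data[row * maxLen + x] = 0.0;
   } CARE_LOOP_2D_STREAM_JAGGED_END

   lengths.free();
   data.free();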

67 changes: 67 additions & 0 deletions src/care/ExecutionSpace.cpp
@@ -0,0 +1,67 @@
//////////////////////////////////////////////////////////////////////////////////////
// Copyright 2023 Lawrence Livermore National Security, LLC and other CARE developers.
// See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////////////

#include "care/config.h"
#include "care/ExecutionSpace.h"

// Explicit use of CHAI spaces that can be configured to not exist is not portable,
// so we define ZERO_COPY and PAGEABLE memory spaces that fall back to something
// that is guaranteed to exist if the optimal choice is not present.

namespace chai {
#if defined(CHAI_ENABLE_PINNED)
chai::ExecutionSpace ZERO_COPY = chai::PINNED;
#elif defined(CHAI_ENABLE_UM)
chai::ExecutionSpace ZERO_COPY = chai::UM;
#elif defined(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU)
chai::ExecutionSpace ZERO_COPY = chai::GPU;
#else
chai::ExecutionSpace ZERO_COPY = chai::CPU;
#endif

#if defined(CHAI_ENABLE_UM)
chai::ExecutionSpace PAGEABLE = chai::UM;
#elif defined(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU)
chai::ExecutionSpace PAGEABLE = chai::GPU;
#else
chai::ExecutionSpace PAGEABLE = chai::CPU;
#endif

#if defined(CARE_GPUCC) || CARE_ENABLE_GPU_SIMULATION_MODE
chai::ExecutionSpace DEFAULT = chai::GPU;
#else
chai::ExecutionSpace DEFAULT = chai::CPU;
#endif

}

namespace care {
#if defined(CHAI_ENABLE_PINNED)
care::ExecutionSpace ZERO_COPY = care::PINNED;
#elif defined(CHAI_ENABLE_UM)
care::ExecutionSpace ZERO_COPY = care::UM;
#elif defined(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU)
care::ExecutionSpace ZERO_COPY = care::GPU;
#else
care::ExecutionSpace ZERO_COPY = care::CPU;
#endif

#if defined(CHAI_ENABLE_UM)
care::ExecutionSpace PAGEABLE = care::UM;
#elif defined(CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU)
care::ExecutionSpace PAGEABLE = care::GPU;
#else
care::ExecutionSpace PAGEABLE = care::CPU;
#endif

#if defined(CARE_GPUCC) || CARE_ENABLE_GPU_SIMULATION_MODE
care::ExecutionSpace DEFAULT = care::GPU;
#else
care::ExecutionSpace DEFAULT = care::CPU;
#endif
}
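The intent is that client code asks for the portable aliases instead of naming chai::PINNED or chai::UM directly, so the same source builds whether or not CHAI was configured with those spaces. A hedged sketch (array names and sizes are illustrative):

   // metadata lands in whatever space ZERO_COPY resolved to (PINNED, UM, GPU, or CPU)
   chai::ManagedArray<int> metadata(8, chai::ZERO_COPY);
   // bulk data lands in whatever space PAGEABLE resolved to (UM, GPU, or CPU)
   chai::ManagedArray<double> bulk(1024, chai::PAGEABLE);
   // ... use the arrays ...
   metadata.free();
   bulk.free();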

23 changes: 23 additions & 0 deletions src/care/ExecutionSpace.h
@@ -20,7 +20,30 @@ namespace care {
PINNED = chai::PINNED,
NUM_EXECUTION_SPACES = chai::NUM_EXECUTION_SPACES
};

// the ZERO_COPY memory space. Typically PINNED memory, but may be a different space depending on
// how CHAI and CARE are configured.
extern CARE_DLL_API care::ExecutionSpace ZERO_COPY;
// the PAGEABLE memory space. Typically UM, but may be a different space depending on how
// CHAI and CARE are configured.
extern CARE_DLL_API care::ExecutionSpace PAGEABLE;
// the DEFAULT memory space. Typically GPU for GPU platforms and CPU for CPU platforms, but may be a different space depending on how
// CHAI and CARE are configured.
extern CARE_DLL_API care::ExecutionSpace DEFAULT;
} // namespace care
namespace chai {

// the ZERO_COPY memory space. Typically PINNED memory, but may be a different space depending on
// how CHAI and CARE are configured.
extern chai::ExecutionSpace ZERO_COPY;
// the PAGEABLE memory space. Typically UM, but may be a different space depending on how
// CHAI and CARE are configured.
extern chai::ExecutionSpace PAGEABLE;
// the DEFAULT memory space. Typically GPU for GPU platforms and CPU for CPU platforms, but may be a different space depending on how
// CHAI and CARE are configured.
extern chai::ExecutionSpace DEFAULT;

}

#endif // !defined(_CARE_EXECUTION_SPACE_H_)
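Because these are runtime variables rather than additional enumerators, code can also branch on what a space resolved to in a given build; a small hypothetical sketch:

   if (care::ZERO_COPY == care::CPU) {
      // this build has neither pinned nor unified memory; take an explicit-copy path
   }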

50 changes: 31 additions & 19 deletions src/care/SortFuser.h
@@ -1,6 +1,7 @@
#ifndef CARE_SORT_FUSER_H
#define CARE_SORT_FUSER_H

#include "care/device_ptr.h"
#include "care/host_ptr.h"
#include "care/host_device_ptr.h"
#include "care/LoopFuser.h"
@@ -62,13 +63,13 @@ namespace care {
/// @brief uniq's all arrays registered with the Fuser via uniqArray or sortUniqArray
/// @param isSorted - whether all arrays were sorted before the call to uniq.
///////////////////////////////////////////////////////////////////////////
void uniq(bool isSorted=true);
void uniq(bool isSorted=true, bool realloc=true);

///////////////////////////////////////////////////////////////////////////
/// @author Peter Robinson
/// @brief sorts and uniq's all arrays registered with the Fuser via uniqArray or sortUniqArray
///////////////////////////////////////////////////////////////////////////
void sortUniq() { uniq(false);}
void sortUniq(bool realloc=true) { uniq(false, realloc);}
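With the new flag, callers that manage the output allocations themselves can skip the resize; a hypothetical sketch (assumes arrays were already registered via uniqArray/sortUniqArray, whose signatures are not shown in this hunk):

   care::SortFuser<int> fuser;   // construction details not shown in this hunk
   // ... register arrays with fuser.uniqArray(...) / fuser.sortUniqArray(...) ...
   fuser.sortUniq(/* realloc = */ false);   // sort + uniq without reallocating the registered outputs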

///////////////////////////////////////////////////////////////////////////
/// @author Peter Robinson
Expand Down Expand Up @@ -258,7 +259,7 @@ namespace care {
/// perform a fused uniq, sorting if necessary.
///
template <typename T>
void SortFuser<T>::uniq(bool isSorted) {
void SortFuser<T>::uniq(bool isSorted, bool realloc) {
assemble();
host_device_ptr<T> concatenated_out;
if (!isSorted) {
@@ -284,29 +285,38 @@
}
}
} CARE_STREAM_LOOP_END

host_ptr<const int> host_out_offsets = out_offsets;
host_device_ptr<int> concatenated_lengths(m_num_arrays, "concatenated_lengths");
host_device_ptr<T> result = concatenated_out;
FUSIBLE_LOOPS_START
for (int a = 0; a < m_num_arrays; ++a) {

// set up a 2D kernel, put per-array meta-data in pinned memory to eliminate cudaMemcpy's of the smaller dimension of data
host_device_ptr<int> lengths(chai::ManagedArray<int>(m_num_arrays, chai::ZERO_COPY));
host_device_ptr<host_device_ptr<int> > out_arrays(chai::ManagedArray<host_device_ptr<int>>(m_num_arrays, chai::ZERO_COPY));
host_ptr<int> pinned_lengths = lengths.getPointer(care::ZERO_COPY, false);
host_ptr<host_device_ptr<int>> pinned_out_arrays = out_arrays.getPointer(care::ZERO_COPY, false);
// initialize lengths, maxLength, and the array of arrays for the 2D kernel
int maxLength = 0;
for (int a = 0; a < m_num_arrays; ++a ) {
// update output length by doing subtraction of the offsets
int & len = *m_out_lengths[a];
len = host_out_offsets[a+1]-host_out_offsets[a];
// grow / shrink array to appropriate length
host_device_ptr<T> &array = *m_out_arrays[a];
array.realloc(len);
int offset = host_out_offsets[a];
// scatter results into output arrays
FUSIBLE_LOOP_STREAM(i,0,len) {
result[i+offset] -= max_range*a;
array[i] = result[i+offset];
if (i == 0) {
concatenated_lengths[a] = len;
}
} FUSIBLE_LOOP_STREAM_END
pinned_lengths[a]= len;
maxLength = care::max(len, maxLength);
if (realloc) {
m_out_arrays[a]->realloc(len);
}
pinned_out_arrays[a] = *m_out_arrays[a];
}
FUSIBLE_LOOPS_STOP
// subtract out the offset, copy the result into individual arrays
// (use of device pointer is to avoid clang-query rules that prevent capture of raw pointer)
device_ptr<int> dev_pinned_lengths = lengths.getPointer(ZERO_COPY, false);
CARE_LOOP_2D_STREAM_JAGGED(i, 0, maxLength, lengths, a, 0, m_num_arrays, iFlattened) {
result[i+out_offsets[a]] -= max_range*a;
out_arrays[a][i] = result[i+out_offsets[a]];
if (i == 0) {
concatenated_lengths[a] = dev_pinned_lengths[a];
}
} CARE_LOOP_2D_STREAM_JAGGED_END

// m_concatenated_result contains result of the initial concatenation, need to swap it
// out with the result of the uniq
@@ -316,6 +326,8 @@

out_offsets.free();

lengths.free();
out_arrays.free();
}
}

1 change: 1 addition & 0 deletions src/care/config.h.in
@@ -25,6 +25,7 @@
#cmakedefine CARE_ENABLE_FUSER_BIN_32
#cmakedefine01 CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
#cmakedefine01 CARE_ENABLE_PINNED_MEMORY_FOR_SCANS
#cmakedefine CARE_GPU_MEMORY_IS_ACCESSIBLE_ON_CPU
#cmakedefine CARE_ENABLE_STALE_DATA_CHECK
#cmakedefine CARE_ENABLE_RACE_DETECTION

2 changes: 1 addition & 1 deletion src/care/device_ptr.h
@@ -62,7 +62,7 @@ namespace care {
///
/// Copy constructor
///
CARE_HOST_DEVICE device_ptr(device_ptr const &ptr) noexcept : m_ptr(ptr) {}
CARE_HOST_DEVICE device_ptr(device_ptr const &ptr) noexcept : m_ptr(ptr.m_ptr) {}

///
/// @author Peter Robinson
86 changes: 86 additions & 0 deletions src/care/forall.h
@@ -470,6 +470,92 @@ namespace care {
break;
}
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Peter Robinson
///
/// @brief Loops over a 2 dimensional index space with varying lengths in the second dimension.
///
/// @arg[in] policy Compile time execution space to select the backend to execute on.
/// @arg[in] xstart X dimension starting index (inclusive)
/// @arg[in] xend X dimension upper bound of ending index (exclusive)
/// @arg[in] host_lengths ending index in x dimension at each y index from ystart (inclusive) to ylength (exclusive). Raw pointer should be in an appropriate memory
/// space for the Exec type
/// @arg[in] ystart The starting index in the y dimension (inclusive)
/// @arg[in] ylength The ending index in the y dimension (exclusive)
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] body The loop body to execute at each (x,y) index
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB, typename Exec>
void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
care::RAJAPlugin::pre_forall_hook(chai::CPU, fileName, lineNumber);
// intentional trigger of copy constructor for CHAI correctness
LB body_to_call{body};
for (int y = ystart; y < ylength; ++y) {
for (int x = xstart ; x < host_lengths[y]; ++x) {
body_to_call(x, y);
}
}
care::RAJAPlugin::post_forall_hook(chai::CPU, fileName, lineNumber);
}

#ifdef CARE_GPUCC
////////////////////////////////////////////////////////////////////////////////
///
/// @author Peter Robinson
///
/// @brief the GPU kernel to call from a care::gpu specialization of launch_2D_jagged
///
/// @arg[in] loopBody The loop body to execute at each (x,y) index
/// @arg[in] lengths ending index in x dimension at each y index from ystart (inclusive) to ylength (exclusive). Raw pointer should be in an appropriate memory
/// space for executing on the GPU, recommend PINNED memory so long as bulk of data is in the x dimension (that is sum(lengths) >> ylength)
/// @arg[in] ylength The ending index in the y dimension (exclusive)
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB>
CARE_GLOBAL void care_kernel_2D(LB loopBody, int const * lengths, int ylength) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lengths[y] && y < ylength) {
loopBody(x,y);
}
}
////////////////////////////////////////////////////////////////////////////////
///
/// @author Peter Robinson
///
/// @brief Loops over a 2 dimensional index space with varying lengths in the second dimension.
///
/// @arg[in] policy Compile time execution space to select the backend to execute on.
/// @arg[in] xstart X dimension starting index (inclusive)
/// @arg[in] xend X dimension upper bound of ending index (exclusive)
/// @arg[in] gpu_lengths ending index in x dimension at each y index from ystart (inclusive) to ylength (exclusive). Raw pointer should be in a memory
///                      space accessible from the GPU
/// @arg[in] ystart The starting index in the y dimension (inclusive)
/// @arg[in] ylength The ending index in the y dimension (exclusive)
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] body The loop body to execute at each (x,y) index
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB>
void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
if (xend > 0 && ylength > 0) {
// TODO launch this kernel in the camp or RAJA default stream - not sure how to do this - for now this is a synchronous call on the CUDA/HIP default stream
care::RAJAPlugin::pre_forall_hook(chai::GPU, fileName, lineNumber);
dim3 dimBlock(CARE_CUDA_BLOCK_SIZE, 1);
dim3 dimGrid;
dimGrid.x = (xend/CARE_CUDA_BLOCK_SIZE)+(xend%CARE_CUDA_BLOCK_SIZE==0?0:1);
dimGrid.y = ylength;
care_kernel_2D<<<dimGrid, dimBlock>>>( body, gpu_lengths, ylength);
care::RAJAPlugin::post_forall_hook(chai::GPU, fileName, lineNumber);
}
}
#endif

} // namespace care

#endif // !defined(_CARE_FORALL_H_)
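A note on the launch configuration above: dimGrid.x is a ceiling division of xend by CARE_CUDA_BLOCK_SIZE, and threads past lengths[y] are masked off by the bounds check in care_kernel_2D. A quick illustration with made-up numbers:

   // e.g. xend = 1000, CARE_CUDA_BLOCK_SIZE = 256:
   //   dimGrid.x = 1000/256 + (1000 % 256 == 0 ? 0 : 1) = 3 + 1 = 4 blocks in x
   // equivalent to the usual rounded-up division:
   int blocksX = (xend + CARE_CUDA_BLOCK_SIZE - 1) / CARE_CUDA_BLOCK_SIZE;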
