diff --git a/examples/main.cpp b/examples/main.cpp
index e7ae3e87..4c6e740f 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -700,6 +700,15 @@ int main(int argc, char **argv)
     CALIPER(CALI_MARK_END("Cycle");)
     MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
   }
+
+  delete[] workflow;
+
+  // TODO: Add smart-pointers
+  for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx) {
+    delete eoses[mat_idx];
+    eoses[mat_idx] = nullptr;
+  }
+
   CALIPER(CALI_MARK_END("TimeStepLoop"););
   MPI_CALL(MPI_Finalize());
   return 0;
diff --git a/src/ml/hdcache.hpp b/src/ml/hdcache.hpp
index a610fcff..cfaa9548 100644
--- a/src/ml/hdcache.hpp
+++ b/src/ml/hdcache.hpp
@@ -248,7 +248,15 @@ class HDCache
     return new_cache;
   }
 
-  ~HDCache() { DBG(Surrogate, "Destroying UQ-cache") }
+  ~HDCache()
+  {
+    DBG(UQModule, "Deleting UQ-Module");
+    if (m_index) {
+      DBG(UQModule, "Deleting HD-Cache");
+      m_index->reset();
+      delete m_index;
+    }
+  }
 
   //! ------------------------------------------------------------------------
   //! simple queries
@@ -400,6 +408,7 @@ class HDCache
     } else {
       _evaluate(ndata, data, is_acceptable);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
   //! train on data that comes separate features (a vector of pointers)
@@ -431,6 +440,7 @@ class HDCache
       _evaluate(ndata, lin_data, is_acceptable);
       ams::ResourceManager::deallocate(lin_data, defaultRes);
     }
+    DBG(UQModule, "Done with evaluation of uq");
  }
 
 private:
@@ -529,28 +539,35 @@ class HDCache
     for (int start = 0; start < ndata; start += MAGIC_NUMBER) {
       unsigned int nElems =
           ((ndata - start) < MAGIC_NUMBER) ? ndata - start : MAGIC_NUMBER;
+      DBG(UQModule, "Running for %d elements %d %d", nElems, start, m_dim);
       m_index->search(nElems,
-                      &data[start],
+                      &data[start * m_dim],
                       knbrs,
                       &kdists[start * knbrs],
                       &kidxs[start * knbrs]);
     }
 
+#ifdef __ENABLE_CUDA__
+    faiss::gpu::synchronizeAllDevices();
+#endif
     // compute means
     if (defaultRes == AMSResourceType::HOST) {
-      TypeValue total_dist = 0;
       for (size_t i = 0; i < ndata; ++i) {
         CFATAL(UQModule,
                m_policy == AMSUQPolicy::DeltaUQ,
                "DeltaUQ is not supported yet");
        if (m_policy == AMSUQPolicy::FAISSMean) {
-          total_dist =
-              std::accumulate(kdists + i * knbrs, kdists + (i + 1) * knbrs, 0.);
-          is_acceptable[i] = (ook * total_dist) < acceptable_error;
+          TypeValue mean_dist = std::accumulate(kdists + i * knbrs,
+                                                kdists + (i + 1) * knbrs,
+                                                0.)
* + ook; + is_acceptable[i] = mean_dist < acceptable_error; } else if (m_policy == AMSUQPolicy::FAISSMax) { // Take the furtherst cluster as the distance metric - total_dist = kdists[i * knbrs + knbrs - 1]; - is_acceptable[i] = (total_dist) < acceptable_error; + TypeValue max_dist = + *std::max_element(&kdists[i * knbrs], + &kdists[i * knbrs + knbrs - 1]); + is_acceptable[i] = (max_dist) < acceptable_error; } } } else { diff --git a/src/wf/cuda/utilities.cuh b/src/wf/cuda/utilities.cuh index 76818857..7f10f26c 100644 --- a/src/wf/cuda/utilities.cuh +++ b/src/wf/cuda/utilities.cuh @@ -23,13 +23,9 @@ const int warpSize = 32; const unsigned int fullMask = 0xffffffff; -__host__ int divup(int x, int y) { - return (x + y - 1) / y; -} +__host__ int divup(int x, int y) { return (x + y - 1) / y; } -__device__ __inline__ int pow2i(int e) { - return 1 << e; -} +__device__ __inline__ int pow2i(int e) { return 1 << e; } // Define this to turn on error checking #define CUDA_ERROR_CHECK @@ -37,207 +33,247 @@ __device__ __inline__ int pow2i(int e) { #define CUDASAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__) #define CUDACHECKERROR() __cudaCheckError(__FILE__, __LINE__) -inline void __cudaSafeCall(cudaError err, const char* file, const int line) { +inline void __cudaSafeCall(cudaError err, const char* file, const int line) +{ #ifdef CUDA_ERROR_CHECK - if (cudaSuccess != err) { - fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - - fprintf(stdout, "cudaSafeCall() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } + if (cudaSuccess != err) { + fprintf(stderr, + "cudaSafeCall() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + + fprintf(stdout, + "cudaSafeCall() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } #endif - return; + return; } struct is_true { - __host__ __device__ bool operator()(const int x) { return x; } + __host__ __device__ bool operator()(const int x) { return x; } }; struct is_false { - __host__ __device__ bool operator()(const int x) { return !x; } + __host__ __device__ bool operator()(const int x) { return !x; } }; -inline void __cudaCheckError(const char* file, const int line) { +inline void __cudaCheckError(const char* file, const int line) +{ #ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } - - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, + "cudaCheckError() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. 
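
Reviewer note on the reformatted error helpers: below is a minimal sketch of how CUDASAFECALL / CUDACHECKERROR are meant to wrap a launch (the kernel and wrapper names are hypothetical, not part of this patch). The synchronizing check that follows in __cudaCheckError is what turns asynchronous kernel failures into an error reported at the call site.

// Hypothetical usage of the checking macros defined above.
__global__ void scaleK(float* v, int n, float a)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) v[i] *= a;
}

void scale(float* d_v, int n, float a)
{
  // Launch, then surface invalid configurations (and, with the sync check
  // enabled, asynchronous kernel errors) right here.
  scaleK<<<divup(n, 256), 256>>>(d_v, n, a);
  CUDACHECKERROR();
}
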
+ err = cudaDeviceSynchronize(); + if (cudaSuccess != err) { + fprintf(stderr, + "cudaCheckError() with sync failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } #endif - return; + return; } -__global__ void srand_dev(curandState* states, const int total_threads) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - int seed = id; // different seed per thread - curand_init(seed, id, 0, &states[id]); - } +__global__ void srand_dev(curandState* states, const int total_threads) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < total_threads) { + int seed = id; // different seed per thread + curand_init(seed, id, 0, &states[id]); + } } -__global__ void initIndices(int* ind, int length) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < length) - ind[id] = id; +__global__ void initIndices(int* ind, int length) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < length) ind[id] = id; } template -__global__ void fillRandom(bool* predicate, const int total_threads, curandState* states, - const size_t length, T threshold) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - for (int i = id; i < length; i += total_threads) { - float x = curand_uniform(&states[id]); - predicate[i] = (x <= threshold); - } +__global__ void fillRandom(bool* predicate, + const int total_threads, + curandState* states, + const size_t length, + T threshold) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < total_threads) { + for (int i = id; i < length; i += total_threads) { + float x = curand_uniform(&states[id]); + predicate[i] = (x <= threshold); } + } } template -__global__ void computeBlockCounts(bool cond, T* d_input, int length, int* d_BlockCounts) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int pred = ( d_input[idx] == cond ); - int BC = __syncthreads_count(pred); - - if (threadIdx.x == 0) { - d_BlockCounts[blockIdx.x] = - BC; // BC will contain the number of valid elements in all threads of this thread block - } +__global__ void computeBlockCounts(bool cond, + T* d_input, + int length, + int* d_BlockCounts) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < length) { + int pred = (d_input[idx] == cond); + int BC = __syncthreads_count(pred); + + if (threadIdx.x == 0) { + d_BlockCounts[blockIdx.x] = + BC; // BC will contain the number of valid elements in all threads of this thread block } + } } template -__global__ void assignK(T** sparse, T** dense, int* indices, size_t length, int dims, - bool isReverse) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int index = indices[idx]; - if (!isReverse) { - for (int i = 0; i < dims; i++) { - dense[i][idx] = sparse[i][index]; - } - } else { - for (int i = 0; i < dims; i++) { - sparse[i][index] = dense[i][idx]; - } - } +__global__ void assignK(T** sparse, + T** dense, + int* indices, + size_t length, + int dims, + bool isReverse) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < length) { + int index = indices[idx]; + if (!isReverse) { + for (int i = 0; i < dims; i++) { + dense[i][idx] = sparse[i][index]; + } + } else { + for (int i = 0; i < dims; i++) { + sparse[i][index] = dense[i][idx]; + } } + } } template -__global__ void compactK(bool cond, T** d_input, +__global__ void compactK(bool cond, + T** d_input, T** d_output, const bool* predicates, const size_t length, int dims, int* d_BlocksOffset, - bool reverse) { - int idx = threadIdx.x + 
blockIdx.x * blockDim.x; - extern __shared__ int warpTotals[]; - if (idx < length) { - int pred = (predicates[idx] == cond); - int w_i = threadIdx.x / warpSize; //warp index - int w_l = idx % warpSize; //thread index within a warp - - // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp - int t_m = fullMask >> (warpSize - w_l); //thread mask + bool reverse) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int warpTotals[]; + if (idx < length) { + int pred = (predicates[idx] == cond); + int w_i = threadIdx.x / warpSize; //warp index + int w_l = idx % warpSize; //thread index within a warp + + // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp + int t_m = fullMask >> (warpSize - w_l); //thread mask #if (CUDART_VERSION < 9000) - int b = __ballot(pred) & t_m; //ballot result = number whose ith bit - //is one if the ith's thread pred is true - //masked up to the current index in warp + int b = __ballot(pred) & t_m; //ballot result = number whose ith bit + //is one if the ith's thread pred is true + //masked up to the current index in warp #else - int b = __ballot_sync(fullMask, pred) & t_m; + int b = __ballot_sync(fullMask, pred) & t_m; #endif - int t_u = __popc( - b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX - - // last thread in warp computes total valid counts for the warp - if (w_l == warpSize - 1) { - warpTotals[w_i] = t_u + pred; - } - - // need all warps in thread block to fill in warpTotals before proceeding - __syncthreads(); - - // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block - int numWarps = blockDim.x / warpSize; - unsigned int numWarpsMask = fullMask >> (warpSize - numWarps); - if (w_i == 0 && w_l < numWarps) { - int w_i_u = 0; - for ( int j = 0; j <= 5; j++) { + int t_u = __popc( + b); // popc count the number of bit one. 
simply count the number predicated true BEFORE MY INDEX + + // last thread in warp computes total valid counts for the warp + if (w_l == warpSize - 1) { + warpTotals[w_i] = t_u + pred; + } + + // need all warps in thread block to fill in warpTotals before proceeding + __syncthreads(); + + // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block + int numWarps = blockDim.x / warpSize; + unsigned int numWarpsMask = fullMask >> (warpSize - numWarps); + if (w_i == 0 && w_l < numWarps) { + int w_i_u = 0; + for (int j = 0; j <= 5; j++) { #if (CUDART_VERSION < 9000) - int b_j = __ballot(warpTotals[w_l] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets + int b_j = __ballot( + warpTotals[w_l] & + pow2i(j)); //# of the ones in the j'th digit of the warp offsets #else - int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j)); + int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j)); #endif - w_i_u += (__popc(b_j & t_m)) << j; - } - warpTotals[w_l] = w_i_u; - } - - // need all warps in thread block to wait until prefix sum is calculated in warpTotals - __syncthreads(); - - // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid - if (pred) { - if (!reverse) { - for (int i = 0; i < dims; i++) - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = - d_input[i][idx]; - } else { - for (int i = 0; i < dims; i++) - d_input[i][idx] = - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]]; - } - } + w_i_u += (__popc(b_j & t_m)) << j; + } + warpTotals[w_l] = w_i_u; } -} + // need all warps in thread block to wait until prefix sum is calculated in warpTotals + __syncthreads(); + + // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid + if (pred) { + if (!reverse) { + for (int i = 0; i < dims; i++) + d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = + d_input[i][idx]; + } else { + for (int i = 0; i < dims; i++) + d_input[i][idx] = + d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]]; + } + } + } +} -template -void __global__ linearizeK(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if ( idx >= elements ) - return; - for (int i = 0; i < dims; i++ ){ - output[ idx * dims + i] = static_cast(inputs[i][idx]); - } +template +void __global__ linearizeK(TypeOutValue* output, + const TypeInValue* const* inputs, + size_t dims, + size_t elements) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= elements) return; + for (int i = 0; i < dims; i++) { + output[idx * dims + i] = static_cast(inputs[i][idx]); + } } -void __global__ compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if ( idx >= nData ) - return; +void __global__ compute_predicate(float* data, + bool* predicate, + size_t nData, + const size_t kneigh, + float threshold) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= nData) return; - int index = idx * kneigh; - float acc = 0.0f; - for (int i = 0; i < kneigh; i++) - acc += data[index + i]; - acc /= static_cast(kneigh); + int index = idx * kneigh; + float acc = 0.0f; + for (int i = 0; i < kneigh; i++) { + acc += data[index + i]; + } - bool pred = acc < 
threshold ? true : false; + acc /= static_cast(kneigh); - predicate[idx] = pred; + bool pred = acc < threshold ? true : false; + + predicate[idx] = pred; } template @@ -248,107 +284,176 @@ int compact(bool cond, const size_t length, int dims, int blockSize, - bool isReverse = false) { - int numBlocks = divup(length, blockSize); - int* d_BlocksCount = ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); - int* d_BlocksOffset = ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); - // determine number of elements in the compacted list - int* h_BlocksCount = ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); - int* h_BlocksOffset = ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); - - T** d_dense = ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); - T** d_sparse = ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); - - ams::ResourceManager::registerExternal(dense, sizeof(T*) * dims, AMSResourceType::HOST); - ams::ResourceManager::registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::HOST); - ams::ResourceManager::copy(dense, d_dense); - ams::ResourceManager::copy(const_cast (sparse), d_sparse); - thrust::device_ptr thrustPrt_bCount(d_BlocksCount); - thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); - - //phase 1: count number of valid elements in each thread block - computeBlockCounts<<>>(cond, dPredicate, length, d_BlocksCount); - - //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid - thrust::exclusive_scan(thrust::device, d_BlocksCount, d_BlocksCount + numBlocks, d_BlocksOffset); - - //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements - compactK<<>>( cond, - d_sparse, d_dense, dPredicate, length, dims, d_BlocksOffset, isReverse); - - ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount); - ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset); - int compact_length = h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; - - ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE); - ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE); - - ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST); - ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST); - - ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE); - ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE); - - ams::ResourceManager::deregisterExternal(dense); - ams::ResourceManager::deregisterExternal(sparse); - cudaDeviceSynchronize(); + bool isReverse = false) +{ + int numBlocks = divup(length, blockSize); + int* d_BlocksCount = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); + int* d_BlocksOffset = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); + // determine number of elements in the compacted list + int* h_BlocksCount = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); + int* h_BlocksOffset = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); + + T** d_dense = + ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); + T** d_sparse = + ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); + + ams::ResourceManager::registerExternal(dense, + sizeof(T*) * dims, + AMSResourceType::HOST); + ams::ResourceManager::registerExternal(sparse, + sizeof(T*) * dims, + AMSResourceType::HOST); + 
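
A note on the registerExternal/copy sequence above: dense and sparse are host-side arrays holding dims device pointers, and compactK/assignK dereference d_dense[i] / d_sparse[i] on the device, so the pointer table itself has to be resident in device memory. In plain CUDA the same idea looks like this (a sketch with illustrative names, not the ResourceManager API):

// Sketch: make a host-side table of `dims` device pointers visible to kernels.
template <typename T>
T** make_device_ptr_table(T** h_table, int dims)
{
  T** d_table = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&d_table), dims * sizeof(T*));
  cudaMemcpy(d_table, h_table, dims * sizeof(T*), cudaMemcpyHostToDevice);
  return d_table;  // caller releases the table with cudaFree(d_table)
}
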
ams::ResourceManager::copy(dense, d_dense); + ams::ResourceManager::copy(const_cast(sparse), d_sparse); + thrust::device_ptr thrustPrt_bCount(d_BlocksCount); + thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); + + //phase 1: count number of valid elements in each thread block + computeBlockCounts<<>>(cond, + dPredicate, + length, + d_BlocksCount); + + //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid + thrust::exclusive_scan(thrust::device, + d_BlocksCount, + d_BlocksCount + numBlocks, + d_BlocksOffset); + + //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements + compactK<<>>( + cond, + d_sparse, + d_dense, + dPredicate, + length, + dims, + d_BlocksOffset, + isReverse); + + ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount); + ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset); + int compact_length = + h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; + + ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE); + ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE); + + ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST); + ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST); + + ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE); + ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE); + + ams::ResourceManager::deregisterExternal(dense); + ams::ResourceManager::deregisterExternal(sparse); + cudaDeviceSynchronize(); - return compact_length; + return compact_length; } template -int compact(bool cond, T** sparse, T** dense, int* indices, const size_t length, int dims, int blockSize, - const bool* dPredicate, bool isReverse = false) { - int numBlocks = divup(length, blockSize); - size_t sparseElements = length; - - if (!isReverse) { - initIndices<<>>(indices, length); - if ( cond ){ - auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate, - indices, is_true()); - sparseElements = last - indices; - } - else{ - auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate, - indices, is_false()); - sparseElements = last - indices; - } +int compact(bool cond, + T** sparse, + T** dense, + int* indices, + const size_t length, + int dims, + int blockSize, + const bool* dPredicate, + bool isReverse = false) +{ + int numBlocks = divup(length, blockSize); + size_t sparseElements = length; + + if (!isReverse) { + initIndices<<>>(indices, length); + if (cond) { + auto last = thrust::copy_if(thrust::device, + indices, + indices + sparseElements, + dPredicate, + indices, + is_true()); + sparseElements = last - indices; + } else { + auto last = thrust::copy_if(thrust::device, + indices, + indices + sparseElements, + dPredicate, + indices, + is_false()); + sparseElements = last - indices; } + } - assignK<<>>(sparse, dense, indices, sparseElements, dims, isReverse); - cudaDeviceSynchronize(); + assignK<<>>( + sparse, dense, indices, sparseElements, dims, isReverse); + cudaDeviceSynchronize(); - return sparseElements; + return sparseElements; } -template -void device_linearize(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){ -// TODO: Fix "magic number". +template +void device_linearize(TypeOutValue* output, + const TypeInValue* const* inputs, + size_t dims, + size_t elements) +{ + // TODO: Fix "magic number". 
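
The TODO above refers to the hard-coded 256-thread block size. For clarity, this is the transform linearizeK performs: the dims separate feature arrays are packed into one row-major elements x dims buffer. A host-side reference of the same transform (sketch, not part of the patch):

// Host reference for linearizeK: inputs[d][i] -> output[i * dims + d].
template <typename TypeOutValue, typename TypeInValue>
void host_linearize(TypeOutValue* output,
                    const TypeInValue* const* inputs,
                    size_t dims,
                    size_t elements)
{
  for (size_t i = 0; i < elements; i++)
    for (size_t d = 0; d < dims; d++)
      output[i * dims + d] = static_cast<TypeOutValue>(inputs[d][i]);
}
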
const int NT = 256; -// TODO: We should add a max number of blocks typically this should be around 3K. - int NB = (elements + NT - 1 ) / NT; + // TODO: We should add a max number of blocks typically this should be around 3K. + int NB = (elements + NT - 1) / NT; + DBG(Device, + "Linearize using %ld blocks %ld threads to transpose %ld, %ld matrix", + NB, + NT, + dims, + elements); + linearizeK<<>>(output, inputs, dims, elements); + cudaDeviceSynchronize(); } template -void cuda_rand_init(bool* predicate, const size_t length, T threshold) { - static curandState* dev_random = NULL; - const int TS = 4096; - const int BS = 128; - int numBlocks = divup(TS, BS); - if (!dev_random) { - dev_random = ams::ResourceManager::allocate(4096); - srand_dev<<>>(dev_random, TS); - } - - fillRandom<<>>(predicate, TS, dev_random, length, threshold); - cudaDeviceSynchronize(); +void cuda_rand_init(bool* predicate, const size_t length, T threshold) +{ + static curandState* dev_random = NULL; + const int TS = 4096; + const int BS = 128; + int numBlocks = divup(TS, BS); + if (!dev_random) { + dev_random = ams::ResourceManager::allocate(4096); + srand_dev<<>>(dev_random, TS); + } + + DBG(Device, + "Random Fill using %ld blocks %ld threads to randomly initialize %ld " + "elements", + numBlocks, + BS, + length); + fillRandom<<>>(predicate, TS, dev_random, length, threshold); + cudaDeviceSynchronize(); } -void device_compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){ +void device_compute_predicate(float* data, + bool* predicate, + size_t nData, + const size_t kneigh, + float threshold) +{ const int NT = 256; - int NB = (nData + NT - 1 ) / NT; + int NB = (nData + NT - 1) / NT; + DBG(Device, + "Compute predicate for %d elements with threshold %f", + nData, + threshold); compute_predicate<<>>(data, predicate, nData, kneigh, threshold); cudaDeviceSynchronize(); } diff --git a/src/wf/resource_manager.hpp b/src/wf/resource_manager.hpp index 3d99ddc4..b5db514e 100644 --- a/src/wf/resource_manager.hpp +++ b/src/wf/resource_manager.hpp @@ -141,7 +141,7 @@ PERFFASPECT() static TypeInValue* allocate(size_t nvalues, AMSResourceType dev = default_resource) { static auto& rm = umpire::ResourceManager::getInstance(); - DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s %d", nvalues, getAllocatorName(dev)); + DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s", nvalues, getAllocatorName(dev)); auto alloc = rm.getAllocator(allocator_ids[dev]); TypeInValue *ret = static_cast(alloc.allocate(nvalues * sizeof(TypeInValue))); CFATAL(ResourceManager, ret == nullptr, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 268ba56d..b79a7890 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -24,5 +24,18 @@ endfunction() ADDTEST(ams_allocator ams_allocate.cpp AMSAllocate) ADDTEST(ams_packing cpu_packing_test.cpp AMSPack) +if (WITH_TORCH) ADDTEST(ams_inference_double torch_model.cpp AMSInferDouble ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "double") ADDTEST(ams_inference_single torch_model.cpp AMSInferSingle ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "single") +endif() + +if(WITH_FAISS) +ADDTEST(ams_hdcache_mean_double test_hdcache.cpp AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 0 10 4.0 4 5) +# The max case fails on DEVICE. 
We should be aware abou this when adding support for CI for GPUs +ADDTEST(ams_hdcache_max_double test_hdcache.cpp AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5) + +ADDTEST(ams_hdcache_mean_single test_hdcache.cpp AMSHDCacheMeanPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 0 10 4.0 4 5) +# The max case fails on DEVICE. We should be aware abou this when adding support for CI for GPUs +ADDTEST(ams_hdcache_max_single test_hdcache.cpp AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5) +endif() + diff --git a/tests/faiss_debug.pt b/tests/faiss_debug.pt new file mode 100644 index 00000000..7ea0c6eb Binary files /dev/null and b/tests/faiss_debug.pt differ diff --git a/tests/generate_faiss.py b/tests/generate_faiss.py new file mode 100644 index 00000000..05d66ff1 --- /dev/null +++ b/tests/generate_faiss.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import numpy as np +import faiss # make faiss available + + +def create_elements(nb, d, distance=10, offset=0): + centers = [] + for i in range(nb): + c = ((i + 1) * distance) + offset + centers.append(np.random.uniform(low=c - 0.5, high=c + 0.5, size=(100, d))) + + xb = np.vstack(centers).astype("float32") + return xb + + +def main(): + parser = argparse.ArgumentParser( + description="Faiss index generator for test purposes" + ) + parser.add_argument( + "-d", "--dimensions", type=int, help="Dimensions of our vectors", default=4 + ) + parser.add_argument( + "-dst", + "--distance", + type=int, + help="Jump between centers in Faiss index", + default=10, + ) + parser.add_argument( + "-f", + "--file", + type=str, + help="File to store FAISS index", + default="faiss_debug.pt", + ) + parser.add_argument( + "-c", + "--num-centers", + type=int, + help="Number of centers in FAISS index", + default=10, + ) + args = parser.parse_args() + + xb = create_elements(args.num_centers, args.dimensions) + index = faiss.IndexFlatL2(args.dimensions) # build the index + index.add(xb) # add vectors to the index + faiss.write_index(index, args.file) + # print(xb) + + # k=1 + # xq_false = create_elements(args.num_centers, args.dimensions, args.distance, 5) + # xq_true = create_elements(args.num_centers,args.dimensions) + # xq = np.vstack((xq_false, xq_true)).astype('float32') + # print("Search for") + # print(xq) + # D, I = index.search(xq, k) + # print("Distance is", D) + # print("Indexes are", I) + + # predicate = np.any(D < 4, axis=1) + # print(predicate) + + +if __name__ == "__main__": + main() diff --git a/tests/test_hdcache.cpp b/tests/test_hdcache.cpp new file mode 100644 index 00000000..b26de8fe --- /dev/null +++ b/tests/test_hdcache.cpp @@ -0,0 +1,182 @@ +/* + * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other + * AMSLib Project Developers + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +template +std::vector generate_vectors(const int num_clusters, + int elements, + int dims) +{ + std::vector v_data; + // This are fixed to mimic the way the faiss was generated + // The code below generates data values that are either within + // the distance of the faiss index or just outside of it. 
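
To spell out the comment above: generate_faiss.py places cluster centers at (i + 1) * distance with a jitter of 0.5, and the loop below shifts every even-indexed element by offset so it falls outside the acceptance radius. The property the test checks therefore reduces to the following (illustrative sketch, mirroring validate() further down):

// Even-indexed elements are pushed away from every cluster center and should
// be rejected by the UQ cache; odd-indexed elements land near a center and
// should be accepted.
inline bool expected_acceptable(int element_idx) { return (element_idx % 2) == 1; }
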
+ const T distance = 10.0; + const T offset = 5.0; + for (int i = 0; i < dims; i++) { + T *data = ams::ResourceManager::allocate(num_clusters * elements, + AMSResourceType::HOST); + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + T tmp = ((T)rand()) / INT_MAX; + tmp += (k + 1) * num_clusters; + if ((j % 2) == 0) { + tmp += offset; + } + data[j * num_clusters + k] = tmp; + } + } + v_data.push_back(data); + } + return std::move(v_data); +} + +template +void print_vectors(std::vector &vec, int num_elements, int num_clusters) +{ + for (int i = 0; i < num_elements; i++) { + for (int c = 0; c < num_clusters; c++) { + for (auto v : vec) { + std::cout << v[i * num_clusters + c] << ":"; + } + std::cout << "\n"; + } + } +} + + +bool validate(const int num_clusters, const int elements, bool *predicates) +{ + bool res = true; + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + if (j % 2 == 0 && predicates[j * num_clusters + k] == true) { + res = false; + } else if (j % 2 == 1 && predicates[j * num_clusters + k] == false) { + res = false; + } + } + } + return res; +} + +template +bool do_faiss(std::shared_ptr> &index, + AMSResourceType resource, + int nClusters, + int nDims, + int nElements, + float threshold) +{ + + std::vector orig_data = + generate_vectors(nClusters, nElements, nDims); + std::vector data = orig_data; + + bool *predicates = + ams::ResourceManager::allocate(nClusters * nElements, resource); + + if (resource == AMSResourceType::DEVICE) { + for (int i = 0; i < orig_data.size(); i++) { + T *d_data = + ams::ResourceManager::allocate(nClusters * nElements, resource); + ams::ResourceManager::copy(const_cast(orig_data[i]), + d_data, + nClusters * nElements * sizeof(T)); + data[i] = d_data; + } + } + + + index->evaluate(nClusters * nElements, data, predicates); + + bool *h_predicates = predicates; + + if (resource == AMSResourceType::DEVICE) { + h_predicates = ams::ResourceManager::allocate(nClusters * nElements, + AMSResourceType::HOST); + ams::ResourceManager::copy(predicates, h_predicates, nClusters * nElements); + for (auto d : data) { + ams::ResourceManager::deallocate(const_cast(d), + AMSResourceType::DEVICE); + } + ams::ResourceManager::deallocate(predicates, AMSResourceType::DEVICE); + } + + + for (auto h_d : orig_data) + ams::ResourceManager::deallocate(const_cast(h_d), + AMSResourceType::HOST); + + bool res = validate(nClusters, nElements, h_predicates); + + ams::ResourceManager::deallocate(h_predicates, AMSResourceType::HOST); + return res; +} + + +int main(int argc, char *argv[]) +{ + using namespace ams; + + if (argc < 8) { + std::cerr << "Wrong CLI\n"; + std::cerr << argv[0] + << " 'use device' 'path to faiss' 'data type (double|float)' " + "'UQPolicy (0:Mean, 1:Max)' 'Num Clusters' 'Threshold' " + "'number of dimensions' 'num elements'"; + abort(); + } + auto &rm = umpire::ResourceManager::getInstance(); + int use_device = std::atoi(argv[1]); + char *faiss_path = argv[2]; + char *data_type = argv[3]; + AMSUQPolicy uq_policy = static_cast(std::atoi(argv[4])); + int nClusters = std::atoi(argv[5]); + float threshold = std::atoi(argv[6]); + int nDims = std::atoi(argv[7]); + int nElements = std::atoi(argv[8]); + + AMSSetupAllocator(AMSResourceType::HOST); + AMSResourceType resource = AMSResourceType::HOST; + if (use_device == 1) { + AMSSetupAllocator(AMSResourceType::DEVICE); + AMSSetDefaultAllocator(AMSResourceType::DEVICE); 
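
Two small review notes on the argument handling above: the guard checks argc < 8 while argv[8] is read, so argc < 9 would match the usage string; and threshold is a float on the command line ("4.0" in the CMake tests) but is parsed with std::atoi, which truncates it. A safer parse (sketch, std::strtof from <cstdlib>):

// Parse the CLI threshold as a float instead of truncating it with atoi.
float threshold = std::strtof(argv[6], nullptr);

Relatedly, the FAISSMax branch in hdcache.hpp passes &kdists[i * knbrs + knbrs - 1] as the end iterator to std::max_element, which excludes the last (largest) distance; &kdists[(i + 1) * knbrs] would match the stated intent of taking the furthest neighbor.
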
+    resource = AMSResourceType::DEVICE;
+  }
+
+  if (std::strcmp("double", data_type) == 0) {
+    std::shared_ptr<HDCache<double>> cache = HDCache<double>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  } else if (std::strcmp("single", data_type) == 0) {
+    std::shared_ptr<HDCache<float>> cache = HDCache<float>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  }
+
+
+  return 0;
+}
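
A possible follow-up for the dispatch above (sketch only, relying on the getInstance/do_faiss calls already used in this file): fold the duplicated double/single branches into one templated runner.

// Sketch: one templated runner instead of two copies of the same branch.
template <typename T>
int run_hdcache_test(char *faiss_path,
                     int use_device,
                     AMSUQPolicy uq_policy,
                     AMSResourceType resource,
                     int nClusters,
                     int nDims,
                     int nElements,
                     float threshold)
{
  std::shared_ptr<HDCache<T>> cache =
      HDCache<T>::getInstance(faiss_path, use_device, uq_policy, 10, threshold);
  bool result =
      do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
  cache.reset();
  return !result;  // keep the existing convention: 0 on success
}

main() would then reduce to calling run_hdcache_test<double> or run_hdcache_test<float> depending on data_type.
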