diff --git a/examples/main.cpp b/examples/main.cpp
index e7ae3e87..4c6e740f 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -700,6 +700,15 @@ int main(int argc, char **argv)
     CALIPER(CALI_MARK_END("Cycle");)
     MPI_CALL(MPI_Barrier(MPI_COMM_WORLD));
   }
+
+  delete[] workflow;
+
+  // TODO: Add smart-pointers
+  for (int mat_idx = 0; mat_idx < num_mats; ++mat_idx) {
+    delete eoses[mat_idx];
+    eoses[mat_idx] = nullptr;
+  }
+
   CALIPER(CALI_MARK_END("TimeStepLoop"););
   MPI_CALL(MPI_Finalize());
   return 0;
diff --git a/src/ml/hdcache.hpp b/src/ml/hdcache.hpp
index a610fcff..cfaa9548 100644
--- a/src/ml/hdcache.hpp
+++ b/src/ml/hdcache.hpp
@@ -248,7 +248,15 @@ class HDCache
     return new_cache;
   }
 
-  ~HDCache() { DBG(Surrogate, "Destroying UQ-cache") }
+  ~HDCache()
+  {
+    DBG(UQModule, "Deleting UQ-Module");
+    if (m_index) {
+      DBG(UQModule, "Deleting HD-Cache");
+      m_index->reset();
+      delete m_index;
+    }
+  }
 
   //! ------------------------------------------------------------------------
   //! simple queries
@@ -400,6 +408,7 @@ class HDCache
     } else {
       _evaluate(ndata, data, is_acceptable);
     }
+    DBG(UQModule, "Done with evaluation of uq");
   }
 
   //! train on data that comes separate features (a vector of pointers)
@@ -431,6 +440,7 @@ class HDCache
       _evaluate(ndata, lin_data, is_acceptable);
       ams::ResourceManager::deallocate(lin_data, defaultRes);
     }
+    DBG(UQModule, "Done with evaluation of uq");
  }
 
 private:
@@ -529,28 +539,35 @@ class HDCache
     for (int start = 0; start < ndata; start += MAGIC_NUMBER) {
       unsigned int nElems =
           ((ndata - start) < MAGIC_NUMBER) ? ndata - start : MAGIC_NUMBER;
+      DBG(UQModule, "Running for %d elements %d %d", nElems, start, m_dim);
       m_index->search(nElems,
-                      &data[start],
+                      &data[start * m_dim],
                       knbrs,
                       &kdists[start * knbrs],
                       &kidxs[start * knbrs]);
     }
 
+#ifdef __ENABLE_CUDA__
+    faiss::gpu::synchronizeAllDevices();
+#endif
     // compute means
     if (defaultRes == AMSResourceType::HOST) {
-      TypeValue total_dist = 0;
       for (size_t i = 0; i < ndata; ++i) {
         CFATAL(UQModule,
                m_policy == AMSUQPolicy::DeltaUQ,
                "DeltaUQ is not supported yet");
        if (m_policy == AMSUQPolicy::FAISSMean) {
-          total_dist =
-              std::accumulate(kdists + i * knbrs, kdists + (i + 1) * knbrs, 0.);
-          is_acceptable[i] = (ook * total_dist) < acceptable_error;
+          TypeValue mean_dist = std::accumulate(kdists + i * knbrs,
+                                                kdists + (i + 1) * knbrs,
+                                                0.)
* + ook; + is_acceptable[i] = mean_dist < acceptable_error; } else if (m_policy == AMSUQPolicy::FAISSMax) { // Take the furtherst cluster as the distance metric - total_dist = kdists[i * knbrs + knbrs - 1]; - is_acceptable[i] = (total_dist) < acceptable_error; + TypeValue max_dist = + *std::max_element(&kdists[i * knbrs], + &kdists[i * knbrs + knbrs - 1]); + is_acceptable[i] = (max_dist) < acceptable_error; } } } else { diff --git a/src/wf/cuda/utilities.cuh b/src/wf/cuda/utilities.cuh index 76818857..7f10f26c 100644 --- a/src/wf/cuda/utilities.cuh +++ b/src/wf/cuda/utilities.cuh @@ -23,13 +23,9 @@ const int warpSize = 32; const unsigned int fullMask = 0xffffffff; -__host__ int divup(int x, int y) { - return (x + y - 1) / y; -} +__host__ int divup(int x, int y) { return (x + y - 1) / y; } -__device__ __inline__ int pow2i(int e) { - return 1 << e; -} +__device__ __inline__ int pow2i(int e) { return 1 << e; } // Define this to turn on error checking #define CUDA_ERROR_CHECK @@ -37,207 +33,247 @@ __device__ __inline__ int pow2i(int e) { #define CUDASAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__) #define CUDACHECKERROR() __cudaCheckError(__FILE__, __LINE__) -inline void __cudaSafeCall(cudaError err, const char* file, const int line) { +inline void __cudaSafeCall(cudaError err, const char* file, const int line) +{ #ifdef CUDA_ERROR_CHECK - if (cudaSuccess != err) { - fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - - fprintf(stdout, "cudaSafeCall() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } + if (cudaSuccess != err) { + fprintf(stderr, + "cudaSafeCall() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + + fprintf(stdout, + "cudaSafeCall() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } #endif - return; + return; } struct is_true { - __host__ __device__ bool operator()(const int x) { return x; } + __host__ __device__ bool operator()(const int x) { return x; } }; struct is_false { - __host__ __device__ bool operator()(const int x) { return !x; } + __host__ __device__ bool operator()(const int x) { return !x; } }; -inline void __cudaCheckError(const char* file, const int line) { +inline void __cudaCheckError(const char* file, const int line) +{ #ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } - - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, + "cudaCheckError() failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. 
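
Reviewer note on the reformatted error helpers: below is a minimal sketch of how CUDASAFECALL / CUDACHECKERROR are meant to wrap a launch (the kernel and wrapper names are hypothetical, not part of this patch). The synchronizing check that follows in __cudaCheckError is what turns asynchronous kernel failures into an error reported at the call site.

// Hypothetical usage of the checking macros defined above.
__global__ void scaleK(float* v, int n, float a)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) v[i] *= a;
}

void scale(float* d_v, int n, float a)
{
  // Launch, then surface invalid configurations (and, with the sync check
  // enabled, asynchronous kernel errors) right here.
  scaleK<<<divup(n, 256), 256>>>(d_v, n, a);
  CUDACHECKERROR();
}
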
+ err = cudaDeviceSynchronize(); + if (cudaSuccess != err) { + fprintf(stderr, + "cudaCheckError() with sync failed at %s:%i : %s\n", + file, + line, + cudaGetErrorString(err)); + exit(-1); + } #endif - return; + return; } -__global__ void srand_dev(curandState* states, const int total_threads) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - int seed = id; // different seed per thread - curand_init(seed, id, 0, &states[id]); - } +__global__ void srand_dev(curandState* states, const int total_threads) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < total_threads) { + int seed = id; // different seed per thread + curand_init(seed, id, 0, &states[id]); + } } -__global__ void initIndices(int* ind, int length) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < length) - ind[id] = id; +__global__ void initIndices(int* ind, int length) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < length) ind[id] = id; } template -__global__ void fillRandom(bool* predicate, const int total_threads, curandState* states, - const size_t length, T threshold) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - for (int i = id; i < length; i += total_threads) { - float x = curand_uniform(&states[id]); - predicate[i] = (x <= threshold); - } +__global__ void fillRandom(bool* predicate, + const int total_threads, + curandState* states, + const size_t length, + T threshold) +{ + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id < total_threads) { + for (int i = id; i < length; i += total_threads) { + float x = curand_uniform(&states[id]); + predicate[i] = (x <= threshold); } + } } template -__global__ void computeBlockCounts(bool cond, T* d_input, int length, int* d_BlockCounts) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int pred = ( d_input[idx] == cond ); - int BC = __syncthreads_count(pred); - - if (threadIdx.x == 0) { - d_BlockCounts[blockIdx.x] = - BC; // BC will contain the number of valid elements in all threads of this thread block - } +__global__ void computeBlockCounts(bool cond, + T* d_input, + int length, + int* d_BlockCounts) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < length) { + int pred = (d_input[idx] == cond); + int BC = __syncthreads_count(pred); + + if (threadIdx.x == 0) { + d_BlockCounts[blockIdx.x] = + BC; // BC will contain the number of valid elements in all threads of this thread block } + } } template -__global__ void assignK(T** sparse, T** dense, int* indices, size_t length, int dims, - bool isReverse) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int index = indices[idx]; - if (!isReverse) { - for (int i = 0; i < dims; i++) { - dense[i][idx] = sparse[i][index]; - } - } else { - for (int i = 0; i < dims; i++) { - sparse[i][index] = dense[i][idx]; - } - } +__global__ void assignK(T** sparse, + T** dense, + int* indices, + size_t length, + int dims, + bool isReverse) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < length) { + int index = indices[idx]; + if (!isReverse) { + for (int i = 0; i < dims; i++) { + dense[i][idx] = sparse[i][index]; + } + } else { + for (int i = 0; i < dims; i++) { + sparse[i][index] = dense[i][idx]; + } } + } } template -__global__ void compactK(bool cond, T** d_input, +__global__ void compactK(bool cond, + T** d_input, T** d_output, const bool* predicates, const size_t length, int dims, int* d_BlocksOffset, - bool reverse) { - int idx = threadIdx.x + 
blockIdx.x * blockDim.x; - extern __shared__ int warpTotals[]; - if (idx < length) { - int pred = (predicates[idx] == cond); - int w_i = threadIdx.x / warpSize; //warp index - int w_l = idx % warpSize; //thread index within a warp - - // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp - int t_m = fullMask >> (warpSize - w_l); //thread mask + bool reverse) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int warpTotals[]; + if (idx < length) { + int pred = (predicates[idx] == cond); + int w_i = threadIdx.x / warpSize; //warp index + int w_l = idx % warpSize; //thread index within a warp + + // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp + int t_m = fullMask >> (warpSize - w_l); //thread mask #if (CUDART_VERSION < 9000) - int b = __ballot(pred) & t_m; //ballot result = number whose ith bit - //is one if the ith's thread pred is true - //masked up to the current index in warp + int b = __ballot(pred) & t_m; //ballot result = number whose ith bit + //is one if the ith's thread pred is true + //masked up to the current index in warp #else - int b = __ballot_sync(fullMask, pred) & t_m; + int b = __ballot_sync(fullMask, pred) & t_m; #endif - int t_u = __popc( - b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX - - // last thread in warp computes total valid counts for the warp - if (w_l == warpSize - 1) { - warpTotals[w_i] = t_u + pred; - } - - // need all warps in thread block to fill in warpTotals before proceeding - __syncthreads(); - - // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block - int numWarps = blockDim.x / warpSize; - unsigned int numWarpsMask = fullMask >> (warpSize - numWarps); - if (w_i == 0 && w_l < numWarps) { - int w_i_u = 0; - for ( int j = 0; j <= 5; j++) { + int t_u = __popc( + b); // popc count the number of bit one. 
simply count the number predicated true BEFORE MY INDEX + + // last thread in warp computes total valid counts for the warp + if (w_l == warpSize - 1) { + warpTotals[w_i] = t_u + pred; + } + + // need all warps in thread block to fill in warpTotals before proceeding + __syncthreads(); + + // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block + int numWarps = blockDim.x / warpSize; + unsigned int numWarpsMask = fullMask >> (warpSize - numWarps); + if (w_i == 0 && w_l < numWarps) { + int w_i_u = 0; + for (int j = 0; j <= 5; j++) { #if (CUDART_VERSION < 9000) - int b_j = __ballot(warpTotals[w_l] & pow2i(j)); //# of the ones in the j'th digit of the warp offsets + int b_j = __ballot( + warpTotals[w_l] & + pow2i(j)); //# of the ones in the j'th digit of the warp offsets #else - int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j)); + int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j)); #endif - w_i_u += (__popc(b_j & t_m)) << j; - } - warpTotals[w_l] = w_i_u; - } - - // need all warps in thread block to wait until prefix sum is calculated in warpTotals - __syncthreads(); - - // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid - if (pred) { - if (!reverse) { - for (int i = 0; i < dims; i++) - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = - d_input[i][idx]; - } else { - for (int i = 0; i < dims; i++) - d_input[i][idx] = - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]]; - } - } + w_i_u += (__popc(b_j & t_m)) << j; + } + warpTotals[w_l] = w_i_u; } -} + // need all warps in thread block to wait until prefix sum is calculated in warpTotals + __syncthreads(); + + // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid + if (pred) { + if (!reverse) { + for (int i = 0; i < dims; i++) + d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = + d_input[i][idx]; + } else { + for (int i = 0; i < dims; i++) + d_input[i][idx] = + d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]]; + } + } + } +} -template -void __global__ linearizeK(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if ( idx >= elements ) - return; - for (int i = 0; i < dims; i++ ){ - output[ idx * dims + i] = static_cast(inputs[i][idx]); - } +template +void __global__ linearizeK(TypeOutValue* output, + const TypeInValue* const* inputs, + size_t dims, + size_t elements) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= elements) return; + for (int i = 0; i < dims; i++) { + output[idx * dims + i] = static_cast(inputs[i][idx]); + } } -void __global__ compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if ( idx >= nData ) - return; +void __global__ compute_predicate(float* data, + bool* predicate, + size_t nData, + const size_t kneigh, + float threshold) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= nData) return; - int index = idx * kneigh; - float acc = 0.0f; - for (int i = 0; i < kneigh; i++) - acc += data[index + i]; - acc /= static_cast(kneigh); + int index = idx * kneigh; + float acc = 0.0f; + for (int i = 0; i < kneigh; i++) { + acc += data[index + i]; + } - bool pred = acc < 
threshold ? true : false; + acc /= static_cast(kneigh); - predicate[idx] = pred; + bool pred = acc < threshold ? true : false; + + predicate[idx] = pred; } template @@ -248,107 +284,176 @@ int compact(bool cond, const size_t length, int dims, int blockSize, - bool isReverse = false) { - int numBlocks = divup(length, blockSize); - int* d_BlocksCount = ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); - int* d_BlocksOffset = ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); - // determine number of elements in the compacted list - int* h_BlocksCount = ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); - int* h_BlocksOffset = ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); - - T** d_dense = ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); - T** d_sparse = ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); - - ams::ResourceManager::registerExternal(dense, sizeof(T*) * dims, AMSResourceType::HOST); - ams::ResourceManager::registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::HOST); - ams::ResourceManager::copy(dense, d_dense); - ams::ResourceManager::copy(const_cast (sparse), d_sparse); - thrust::device_ptr thrustPrt_bCount(d_BlocksCount); - thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); - - //phase 1: count number of valid elements in each thread block - computeBlockCounts<<>>(cond, dPredicate, length, d_BlocksCount); - - //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid - thrust::exclusive_scan(thrust::device, d_BlocksCount, d_BlocksCount + numBlocks, d_BlocksOffset); - - //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements - compactK<<>>( cond, - d_sparse, d_dense, dPredicate, length, dims, d_BlocksOffset, isReverse); - - ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount); - ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset); - int compact_length = h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; - - ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE); - ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE); - - ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST); - ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST); - - ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE); - ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE); - - ams::ResourceManager::deregisterExternal(dense); - ams::ResourceManager::deregisterExternal(sparse); - cudaDeviceSynchronize(); + bool isReverse = false) +{ + int numBlocks = divup(length, blockSize); + int* d_BlocksCount = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); + int* d_BlocksOffset = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::DEVICE); + // determine number of elements in the compacted list + int* h_BlocksCount = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); + int* h_BlocksOffset = + ams::ResourceManager::allocate(numBlocks, AMSResourceType::HOST); + + T** d_dense = + ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); + T** d_sparse = + ams::ResourceManager::allocate(dims, AMSResourceType::DEVICE); + + ams::ResourceManager::registerExternal(dense, + sizeof(T*) * dims, + AMSResourceType::HOST); + ams::ResourceManager::registerExternal(sparse, + sizeof(T*) * dims, + AMSResourceType::HOST); + 
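
A note on the registerExternal/copy sequence above: dense and sparse are host-side arrays holding dims device pointers, and compactK/assignK dereference d_dense[i] / d_sparse[i] on the device, so the pointer table itself has to be resident in device memory. In plain CUDA the same idea looks like this (a sketch with illustrative names, not the ResourceManager API):

// Sketch: make a host-side table of `dims` device pointers visible to kernels.
template <typename T>
T** make_device_ptr_table(T** h_table, int dims)
{
  T** d_table = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&d_table), dims * sizeof(T*));
  cudaMemcpy(d_table, h_table, dims * sizeof(T*), cudaMemcpyHostToDevice);
  return d_table;  // caller releases the table with cudaFree(d_table)
}
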
ams::ResourceManager::copy(dense, d_dense); + ams::ResourceManager::copy(const_cast(sparse), d_sparse); + thrust::device_ptr thrustPrt_bCount(d_BlocksCount); + thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); + + //phase 1: count number of valid elements in each thread block + computeBlockCounts<<>>(cond, + dPredicate, + length, + d_BlocksCount); + + //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid + thrust::exclusive_scan(thrust::device, + d_BlocksCount, + d_BlocksCount + numBlocks, + d_BlocksOffset); + + //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements + compactK<<>>( + cond, + d_sparse, + d_dense, + dPredicate, + length, + dims, + d_BlocksOffset, + isReverse); + + ams::ResourceManager::copy(d_BlocksCount, h_BlocksCount); + ams::ResourceManager::copy(d_BlocksOffset, h_BlocksOffset); + int compact_length = + h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; + + ams::ResourceManager::deallocate(d_BlocksCount, AMSResourceType::DEVICE); + ams::ResourceManager::deallocate(d_BlocksOffset, AMSResourceType::DEVICE); + + ams::ResourceManager::deallocate(h_BlocksCount, AMSResourceType::HOST); + ams::ResourceManager::deallocate(h_BlocksOffset, AMSResourceType::HOST); + + ams::ResourceManager::deallocate(d_dense, AMSResourceType::DEVICE); + ams::ResourceManager::deallocate(d_sparse, AMSResourceType::DEVICE); + + ams::ResourceManager::deregisterExternal(dense); + ams::ResourceManager::deregisterExternal(sparse); + cudaDeviceSynchronize(); - return compact_length; + return compact_length; } template -int compact(bool cond, T** sparse, T** dense, int* indices, const size_t length, int dims, int blockSize, - const bool* dPredicate, bool isReverse = false) { - int numBlocks = divup(length, blockSize); - size_t sparseElements = length; - - if (!isReverse) { - initIndices<<>>(indices, length); - if ( cond ){ - auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate, - indices, is_true()); - sparseElements = last - indices; - } - else{ - auto last = thrust::copy_if(thrust::device, indices, indices + sparseElements, dPredicate, - indices, is_false()); - sparseElements = last - indices; - } +int compact(bool cond, + T** sparse, + T** dense, + int* indices, + const size_t length, + int dims, + int blockSize, + const bool* dPredicate, + bool isReverse = false) +{ + int numBlocks = divup(length, blockSize); + size_t sparseElements = length; + + if (!isReverse) { + initIndices<<>>(indices, length); + if (cond) { + auto last = thrust::copy_if(thrust::device, + indices, + indices + sparseElements, + dPredicate, + indices, + is_true()); + sparseElements = last - indices; + } else { + auto last = thrust::copy_if(thrust::device, + indices, + indices + sparseElements, + dPredicate, + indices, + is_false()); + sparseElements = last - indices; } + } - assignK<<>>(sparse, dense, indices, sparseElements, dims, isReverse); - cudaDeviceSynchronize(); + assignK<<>>( + sparse, dense, indices, sparseElements, dims, isReverse); + cudaDeviceSynchronize(); - return sparseElements; + return sparseElements; } -template -void device_linearize(TypeOutValue *output, const TypeInValue * const *inputs, size_t dims, size_t elements){ -// TODO: Fix "magic number". +template +void device_linearize(TypeOutValue* output, + const TypeInValue* const* inputs, + size_t dims, + size_t elements) +{ + // TODO: Fix "magic number". 
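
The TODO above refers to the hard-coded 256-thread block size. For clarity, this is the transform linearizeK performs: the dims separate feature arrays are packed into one row-major elements x dims buffer. A host-side reference of the same transform (sketch, not part of the patch):

// Host reference for linearizeK: inputs[d][i] -> output[i * dims + d].
template <typename TypeOutValue, typename TypeInValue>
void host_linearize(TypeOutValue* output,
                    const TypeInValue* const* inputs,
                    size_t dims,
                    size_t elements)
{
  for (size_t i = 0; i < elements; i++)
    for (size_t d = 0; d < dims; d++)
      output[i * dims + d] = static_cast<TypeOutValue>(inputs[d][i]);
}
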
const int NT = 256; -// TODO: We should add a max number of blocks typically this should be around 3K. - int NB = (elements + NT - 1 ) / NT; + // TODO: We should add a max number of blocks typically this should be around 3K. + int NB = (elements + NT - 1) / NT; + DBG(Device, + "Linearize using %ld blocks %ld threads to transpose %ld, %ld matrix", + NB, + NT, + dims, + elements); + linearizeK<<>>(output, inputs, dims, elements); + cudaDeviceSynchronize(); } template -void cuda_rand_init(bool* predicate, const size_t length, T threshold) { - static curandState* dev_random = NULL; - const int TS = 4096; - const int BS = 128; - int numBlocks = divup(TS, BS); - if (!dev_random) { - dev_random = ams::ResourceManager::allocate(4096); - srand_dev<<>>(dev_random, TS); - } - - fillRandom<<>>(predicate, TS, dev_random, length, threshold); - cudaDeviceSynchronize(); +void cuda_rand_init(bool* predicate, const size_t length, T threshold) +{ + static curandState* dev_random = NULL; + const int TS = 4096; + const int BS = 128; + int numBlocks = divup(TS, BS); + if (!dev_random) { + dev_random = ams::ResourceManager::allocate(4096); + srand_dev<<>>(dev_random, TS); + } + + DBG(Device, + "Random Fill using %ld blocks %ld threads to randomly initialize %ld " + "elements", + numBlocks, + BS, + length); + fillRandom<<>>(predicate, TS, dev_random, length, threshold); + cudaDeviceSynchronize(); } -void device_compute_predicate( float *data, bool *predicate, size_t nData, const size_t kneigh, float threshold){ +void device_compute_predicate(float* data, + bool* predicate, + size_t nData, + const size_t kneigh, + float threshold) +{ const int NT = 256; - int NB = (nData + NT - 1 ) / NT; + int NB = (nData + NT - 1) / NT; + DBG(Device, + "Compute predicate for %d elements with threshold %f", + nData, + threshold); compute_predicate<<>>(data, predicate, nData, kneigh, threshold); cudaDeviceSynchronize(); } diff --git a/src/wf/resource_manager.hpp b/src/wf/resource_manager.hpp index 3d99ddc4..b5db514e 100644 --- a/src/wf/resource_manager.hpp +++ b/src/wf/resource_manager.hpp @@ -141,7 +141,7 @@ PERFFASPECT() static TypeInValue* allocate(size_t nvalues, AMSResourceType dev = default_resource) { static auto& rm = umpire::ResourceManager::getInstance(); - DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s %d", nvalues, getAllocatorName(dev)); + DBG(ResourceManager, "Requesting to allocate %ld values using allocator :%s", nvalues, getAllocatorName(dev)); auto alloc = rm.getAllocator(allocator_ids[dev]); TypeInValue *ret = static_cast(alloc.allocate(nvalues * sizeof(TypeInValue))); CFATAL(ResourceManager, ret == nullptr, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 268ba56d..b79a7890 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -24,5 +24,18 @@ endfunction() ADDTEST(ams_allocator ams_allocate.cpp AMSAllocate) ADDTEST(ams_packing cpu_packing_test.cpp AMSPack) +if (WITH_TORCH) ADDTEST(ams_inference_double torch_model.cpp AMSInferDouble ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "double") ADDTEST(ams_inference_single torch_model.cpp AMSInferSingle ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "single") +endif() + +if(WITH_FAISS) +ADDTEST(ams_hdcache_mean_double test_hdcache.cpp AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 0 10 4.0 4 5) +# The max case fails on DEVICE. 
We should be aware abou this when adding support for CI for GPUs +ADDTEST(ams_hdcache_max_double test_hdcache.cpp AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5) + +ADDTEST(ams_hdcache_mean_single test_hdcache.cpp AMSHDCacheMeanPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 0 10 4.0 4 5) +# The max case fails on DEVICE. We should be aware abou this when adding support for CI for GPUs +ADDTEST(ams_hdcache_max_single test_hdcache.cpp AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5) +endif() + diff --git a/tests/faiss_debug.pt b/tests/faiss_debug.pt new file mode 100644 index 00000000..7ea0c6eb Binary files /dev/null and b/tests/faiss_debug.pt differ diff --git a/tests/generate_faiss.py b/tests/generate_faiss.py new file mode 100644 index 00000000..05d66ff1 --- /dev/null +++ b/tests/generate_faiss.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import numpy as np +import faiss # make faiss available + + +def create_elements(nb, d, distance=10, offset=0): + centers = [] + for i in range(nb): + c = ((i + 1) * distance) + offset + centers.append(np.random.uniform(low=c - 0.5, high=c + 0.5, size=(100, d))) + + xb = np.vstack(centers).astype("float32") + return xb + + +def main(): + parser = argparse.ArgumentParser( + description="Faiss index generator for test purposes" + ) + parser.add_argument( + "-d", "--dimensions", type=int, help="Dimensions of our vectors", default=4 + ) + parser.add_argument( + "-dst", + "--distance", + type=int, + help="Jump between centers in Faiss index", + default=10, + ) + parser.add_argument( + "-f", + "--file", + type=str, + help="File to store FAISS index", + default="faiss_debug.pt", + ) + parser.add_argument( + "-c", + "--num-centers", + type=int, + help="Number of centers in FAISS index", + default=10, + ) + args = parser.parse_args() + + xb = create_elements(args.num_centers, args.dimensions) + index = faiss.IndexFlatL2(args.dimensions) # build the index + index.add(xb) # add vectors to the index + faiss.write_index(index, args.file) + # print(xb) + + # k=1 + # xq_false = create_elements(args.num_centers, args.dimensions, args.distance, 5) + # xq_true = create_elements(args.num_centers,args.dimensions) + # xq = np.vstack((xq_false, xq_true)).astype('float32') + # print("Search for") + # print(xq) + # D, I = index.search(xq, k) + # print("Distance is", D) + # print("Indexes are", I) + + # predicate = np.any(D < 4, axis=1) + # print(predicate) + + +if __name__ == "__main__": + main() diff --git a/tests/test_hdcache.cpp b/tests/test_hdcache.cpp new file mode 100644 index 00000000..b26de8fe --- /dev/null +++ b/tests/test_hdcache.cpp @@ -0,0 +1,182 @@ +/* + * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other + * AMSLib Project Developers + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +template +std::vector generate_vectors(const int num_clusters, + int elements, + int dims) +{ + std::vector v_data; + // This are fixed to mimic the way the faiss was generated + // The code below generates data values that are either within + // the distance of the faiss index or just outside of it. 
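
To spell out the comment above: generate_faiss.py places cluster centers at (i + 1) * distance with a jitter of 0.5, and the loop below shifts every even-indexed element by offset so it falls outside the acceptance radius. The property the test checks therefore reduces to the following (illustrative sketch, mirroring validate() further down):

// Even-indexed elements are pushed away from every cluster center and should
// be rejected by the UQ cache; odd-indexed elements land near a center and
// should be accepted.
inline bool expected_acceptable(int element_idx) { return (element_idx % 2) == 1; }
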
+ const T distance = 10.0; + const T offset = 5.0; + for (int i = 0; i < dims; i++) { + T *data = ams::ResourceManager::allocate(num_clusters * elements, + AMSResourceType::HOST); + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + T tmp = ((T)rand()) / INT_MAX; + tmp += (k + 1) * num_clusters; + if ((j % 2) == 0) { + tmp += offset; + } + data[j * num_clusters + k] = tmp; + } + } + v_data.push_back(data); + } + return std::move(v_data); +} + +template +void print_vectors(std::vector &vec, int num_elements, int num_clusters) +{ + for (int i = 0; i < num_elements; i++) { + for (int c = 0; c < num_clusters; c++) { + for (auto v : vec) { + std::cout << v[i * num_clusters + c] << ":"; + } + std::cout << "\n"; + } + } +} + + +bool validate(const int num_clusters, const int elements, bool *predicates) +{ + bool res = true; + for (int j = 0; j < elements; j++) { + // Generate a value for every cluster center + for (int k = 0; k < num_clusters; k++) { + if (j % 2 == 0 && predicates[j * num_clusters + k] == true) { + res = false; + } else if (j % 2 == 1 && predicates[j * num_clusters + k] == false) { + res = false; + } + } + } + return res; +} + +template +bool do_faiss(std::shared_ptr> &index, + AMSResourceType resource, + int nClusters, + int nDims, + int nElements, + float threshold) +{ + + std::vector orig_data = + generate_vectors(nClusters, nElements, nDims); + std::vector data = orig_data; + + bool *predicates = + ams::ResourceManager::allocate(nClusters * nElements, resource); + + if (resource == AMSResourceType::DEVICE) { + for (int i = 0; i < orig_data.size(); i++) { + T *d_data = + ams::ResourceManager::allocate(nClusters * nElements, resource); + ams::ResourceManager::copy(const_cast(orig_data[i]), + d_data, + nClusters * nElements * sizeof(T)); + data[i] = d_data; + } + } + + + index->evaluate(nClusters * nElements, data, predicates); + + bool *h_predicates = predicates; + + if (resource == AMSResourceType::DEVICE) { + h_predicates = ams::ResourceManager::allocate(nClusters * nElements, + AMSResourceType::HOST); + ams::ResourceManager::copy(predicates, h_predicates, nClusters * nElements); + for (auto d : data) { + ams::ResourceManager::deallocate(const_cast(d), + AMSResourceType::DEVICE); + } + ams::ResourceManager::deallocate(predicates, AMSResourceType::DEVICE); + } + + + for (auto h_d : orig_data) + ams::ResourceManager::deallocate(const_cast(h_d), + AMSResourceType::HOST); + + bool res = validate(nClusters, nElements, h_predicates); + + ams::ResourceManager::deallocate(h_predicates, AMSResourceType::HOST); + return res; +} + + +int main(int argc, char *argv[]) +{ + using namespace ams; + + if (argc < 8) { + std::cerr << "Wrong CLI\n"; + std::cerr << argv[0] + << " 'use device' 'path to faiss' 'data type (double|float)' " + "'UQPolicy (0:Mean, 1:Max)' 'Num Clusters' 'Threshold' " + "'number of dimensions' 'num elements'"; + abort(); + } + auto &rm = umpire::ResourceManager::getInstance(); + int use_device = std::atoi(argv[1]); + char *faiss_path = argv[2]; + char *data_type = argv[3]; + AMSUQPolicy uq_policy = static_cast(std::atoi(argv[4])); + int nClusters = std::atoi(argv[5]); + float threshold = std::atoi(argv[6]); + int nDims = std::atoi(argv[7]); + int nElements = std::atoi(argv[8]); + + AMSSetupAllocator(AMSResourceType::HOST); + AMSResourceType resource = AMSResourceType::HOST; + if (use_device == 1) { + AMSSetupAllocator(AMSResourceType::DEVICE); + AMSSetDefaultAllocator(AMSResourceType::DEVICE); 
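
Two small review notes on the argument handling above: the guard checks argc < 8 while argv[8] is read, so argc < 9 would match the usage string; and threshold is a float on the command line ("4.0" in the CMake tests) but is parsed with std::atoi, which truncates it. A safer parse (sketch, std::strtof from <cstdlib>):

// Parse the CLI threshold as a float instead of truncating it with atoi.
float threshold = std::strtof(argv[6], nullptr);

Relatedly, the FAISSMax branch in hdcache.hpp passes &kdists[i * knbrs + knbrs - 1] as the end iterator to std::max_element, which excludes the last (largest) distance; &kdists[(i + 1) * knbrs] would match the stated intent of taking the furthest neighbor.
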
+    resource = AMSResourceType::DEVICE;
+  }
+
+  if (std::strcmp("double", data_type) == 0) {
+    std::shared_ptr<HDCache<double>> cache = HDCache<double>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  } else if (std::strcmp("single", data_type) == 0) {
+    std::shared_ptr<HDCache<float>> cache = HDCache<float>::getInstance(
+        faiss_path, use_device, uq_policy, 10, threshold);
+    bool result =
+        do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
+    cache.reset();
+    return !result;
+  }
+
+
+  return 0;
+}
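
A possible follow-up for the dispatch above (sketch only, relying on the getInstance/do_faiss calls already used in this file): fold the duplicated double/single branches into one templated runner.

// Sketch: one templated runner instead of two copies of the same branch.
template <typename T>
int run_hdcache_test(char *faiss_path,
                     int use_device,
                     AMSUQPolicy uq_policy,
                     AMSResourceType resource,
                     int nClusters,
                     int nDims,
                     int nElements,
                     float threshold)
{
  std::shared_ptr<HDCache<T>> cache =
      HDCache<T>::getInstance(faiss_path, use_device, uq_policy, 10, threshold);
  bool result =
      do_faiss(cache, resource, nClusters, nDims, nElements, threshold);
  cache.reset();
  return !result;  // keep the existing convention: 0 on success
}

main() would then reduce to calling run_hdcache_test<double> or run_hdcache_test<float> depending on data_type.
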