added OpenMP support for host-side loops

see ComputationalRadiationPhysics#51 Some observations: - we have only trivial (and fast) loops - the other loops are integral steps of the simulation and can not be parallelized (sequential steps and maybe with device-code) - one of the parsing loops uses cudaSetDevice, not sure if it's possible to parallelize that in a good way. - parallelizing std::vector is ok as long as the length is fixed (no reallocation). That means, we may not use vector.push_back() or vector.insert() inside a loop with OpenMP pragmas. Compiler might not complain... Only "easy" loops were parallelized, only basic pragmas were used. Might give some speedup one day, but if not... no problem. Pragmas and code changes are non-intrusive enough to keep it maintainable.
slizzered · Jun 8, 2015 · 91625ac · 91625ac
1 parent 9e9e59a
commit 91625ac
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 39 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -100,6 +100,7 @@ set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
 ################################################################################
 # GNU
 if(CMAKE_COMPILER_IS_GNUCXX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")

diff --git a/src/interpolation.cu → src/interpolation.cc b/src/interpolation.cu → src/interpolation.cc
@@ -21,6 +21,7 @@
 
 #include <vector>
 #include <assert.h>
+#include <omp.h> /* pragma omp parallel for */
 
 #include <interpolation.hpp>
 #include <logging.hpp>
@@ -102,53 +103,57 @@ std::vector<double> interpolateLinear(const std::vector<double> y, const std::ve
 
  // Monochromatic case
  if(y.size() == 1){
- return std::vector<double>(nInterpolations, y.at(0));
+ return std::vector<double>(nInterpolations, y.at(0));
  }
 
  // Check x data if monoton increasing
  assert(isStrictlySorted(x));
 
  // Create equidistant interpolation on x axis
  std::vector<double> interpolated_x(nInterpolations, 0);
+
+ #pragma omp parallel for
  for(unsigned i = 0; i < nInterpolations; ++i){
- interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations));
+ interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations));
  }
 
  // Start to interpolate y values for every x value
  std::vector<double> interpolated_y(nInterpolations, 0);
- for(unsigned i = 0; i < interpolated_x.size(); ++i){
- // Get index of points before and after x
- double y1_i = getNextSmallerIndex(x, interpolated_x.at(i));
- double y2_i = getNextBiggerIndex(x, interpolated_x.at(i));
- int y_diff = y2_i - y1_i;
-
- if(y_diff == 1){
- // First point p1=(x1/y1) before x
- double x1 = x_min + y1_i;
- double y1 = y.at(y1_i);
-
- // Second point p2=(x2/y2) after x
- double x2 = x_min + y2_i;
- double y2 = y.at(y2_i);
- assert(y.size() >= y1_i);
 
- // linear function between p1 and p2 (y=mx+b)
- double m = (y2 - y1) / (x2 / x1);
- double b = y1 - (m * x1);
-
- // Interpolate y from linear function
- interpolated_y.at(i) = m * interpolated_x.at(i) + b;
+ #pragma omp parallel for
+ for(unsigned i = 0; i < nInterpolations; ++i){
+ // Get index of points before and after x
+ double y1_i = getNextSmallerIndex(x, interpolated_x.at(i));
+ double y2_i = getNextBiggerIndex(x, interpolated_x.at(i));
+ int y_diff = y2_i - y1_i;
+
+ if(y_diff == 1){
+ // First point p1=(x1/y1) before x
+ double x1 = x_min + y1_i;
+ double y1 = y.at(y1_i);
+
+ // Second point p2=(x2/y2) after x
+ double x2 = x_min + y2_i;
+ double y2 = y.at(y2_i);
+ assert(y.size() >= y1_i);
+
+ // linear function between p1 and p2 (y=mx+b)
+ double m = (y2 - y1) / (x2 / x1);
+ double b = y1 - (m * x1);
+
+ // Interpolate y from linear function
+ interpolated_y.at(i) = m * interpolated_x.at(i) + b;
+
+ }
+ else if(y_diff == 2){
+ // No interpolation needed
+ interpolated_y.at(i) = y.at(y1_i + 1);
+ }
+ else {
+ dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
+ exit(1);
+ }
 
- }
- else if(y_diff == 2){
- // No interpolation needed
- interpolated_y.at(i) = y.at(y1_i + 1);
- }
- else {
- dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
- exit(0);
- }
-
  }
 
  return interpolated_y;
@@ -184,17 +189,20 @@ std::vector<double> interpolateWavelength(const std::vector<double> sigma_y, con
  }
 
  std::vector<double> y(interpolation_range, 0);
- const double lambda_range = lambda_stop - lambda_start;
+ const unsigned lambda_range = lambda_stop - lambda_start;
  assert(sigma_y.size() >= lambda_range);
 
  // Generate sigma_x
- std::vector<double> sigma_x;
- for(unsigned i = lambda_start; i <= lambda_stop; ++i){
- sigma_x.push_back(i);
+ std::vector<double> sigma_x(lambda_range, 0);
+
+ #pragma omp parallel for
+ for(unsigned i = 0; i < lambda_range; ++i){
+ sigma_x[i] = i+lambda_start;
  }
 
+ #pragma omp parallel for
  for(unsigned i = 0; i < interpolation_range; ++i){
- double x = lambda_start + (i * (lambda_range / interpolation_range));
+ double x = lambda_start + (i * (static_cast<double>(lambda_range) / interpolation_range));
 
  // Get index of points before and after x
  double y1_i = getNextSmallerIndex(sigma_x, x);
@@ -225,7 +233,7 @@ std::vector<double> interpolateWavelength(const std::vector<double> sigma_y, con
  }
  else {
  dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
- exit(0);
+ exit(1);
  }
 
  }

diff --git a/src/parser.cu → src/parser.cc b/src/parser.cu → src/parser.cc
@@ -24,6 +24,7 @@
 #include <vector> /* vector */
 #include <assert.h>
 #include <stdlib.h> /* exit() */
+#include <omp.h> /* pragma omp parallel for */
 
 #include <logging.hpp> 
 #include <mesh.hpp>
@@ -331,6 +332,7 @@ Mesh createMesh(const std::vector<unsigned> &triangleIndices,
  // GPU variables
  double totalSurface = 0.;
 
+ #pragma omp parallel for reduction(+:totalSurface)
  for(unsigned i=0;i<numberOfTriangles;++i){
  totalSurface+=double(surfacesVector.at(i)); 
  }
@@ -341,6 +343,8 @@ Mesh createMesh(const std::vector<unsigned> &triangleIndices,
  std::vector<double> hostCenters(xOfTriangleCenter.begin(), xOfTriangleCenter.end());
  hostCenters.insert(hostCenters.end(),yOfTriangleCenter.begin(),yOfTriangleCenter.end());
  std::vector<float> totalReflectionAngles(refractiveIndices.size()/2,0);
+
+ #pragma omp parallel for
  for(unsigned i=0;i<refractiveIndices.size();i+=2){
  totalReflectionAngles.at(i/2) = (180. / M_PI * asin(refractiveIndices.at(i+1) / refractiveIndices.at(i)));
  }

diff --git a/src/write_to_vtk.cc b/src/write_to_vtk.cc
@@ -24,6 +24,7 @@
 #include <iomanip> /* std::fixed, std::setprecision() */
 #include <string> /* std::to_string() */
 #include <time.h> /* time, time_t */
+#include <omp.h> /* pragma omp parallel for */
 
 #include <boost/filesystem.hpp> /* fs::path */
 #include <boost/filesystem/fstream.hpp> /* fs::ofstream, fs::ifstream */
@@ -195,6 +196,7 @@ std::vector<double> compareVtk(std::vector<double> compare, const fs::path filen
  }
  dout(V_INFO) << "Compare solution with " << filename << std::endl;
 
+ #pragma omp parallel for reduction (+:aseTotal)
  for(unsigned i = 0; i < compare.size(); ++i){
  aseTotal += compare.at(i);
  }