From 91625acadf2c623667acb5b404a6138d12a7ff65 Mon Sep 17 00:00:00 2001
From: Carlchristian Eckert <Carli-Eckert@gmx.de>
Date: Tue, 9 Jun 2015 01:40:57 +0200
Subject: [PATCH] added OpenMP support for host-side loops

 see #51

 Some observations:
  - we have only trivial (and fast) loops
  - the other loops are integral steps of the simulation and can not be
    parallelized (sequential steps and maybe with device-code)
  - one of the parsing loops uses cudaSetDevice, not sure if it's
    possible to parallelize that in a good way.
  - parallelizing std::vector is ok as long as the length is fixed (no
    reallocation). That means, we may not use vector.push_back() or
    vector.insert() inside a loop with OpenMP pragmas. Compiler might
    not complain...

 Only "easy" loops were parallelized, only basic pragmas were used.
 Might give some speedup one day, but if not... no problem. Pragmas and
 code changes are non-intrusive enough to keep it maintainable.
---
 CMakeLists.txt                             |  1 +
 src/{interpolation.cu => interpolation.cc} | 86 ++++++++++++----------
 src/{parser.cu => parser.cc}               |  4 +
 src/write_to_vtk.cc                        |  2 +
 4 files changed, 54 insertions(+), 39 deletions(-)
 rename src/{interpolation.cu => interpolation.cc} (77%)
 rename src/{parser.cu => parser.cc} (99%)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f66f553..87e704c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,7 @@ set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
 ################################################################################
 # GNU
 if(CMAKE_COMPILER_IS_GNUCXX)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")  
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
diff --git a/src/interpolation.cu b/src/interpolation.cc
similarity index 77%
rename from src/interpolation.cu
rename to src/interpolation.cc
index e169126..c213f50 100644
--- a/src/interpolation.cu
+++ b/src/interpolation.cc
@@ -21,6 +21,7 @@
 
 #include <vector>
 #include <assert.h>
+#include <omp.h> /* pragma omp parallel for */
 
 #include <interpolation.hpp>
 #include <logging.hpp>
@@ -102,7 +103,7 @@ std::vector<double> interpolateLinear(const std::vector<double> y, const std::ve
 
     // Monochromatic case
     if(y.size() == 1){
-	return std::vector<double>(nInterpolations, y.at(0));
+      return std::vector<double>(nInterpolations, y.at(0));
     }
 
     // Check x data if monoton increasing
@@ -110,45 +111,49 @@ std::vector<double> interpolateLinear(const std::vector<double> y, const std::ve
 
     // Create equidistant interpolation on x axis
     std::vector<double> interpolated_x(nInterpolations, 0);
+
+    #pragma omp parallel for
     for(unsigned i = 0; i < nInterpolations; ++i){
-	interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations));
+      interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations));
     }
 
     // Start to interpolate y values for every x value
     std::vector<double> interpolated_y(nInterpolations, 0);
-    for(unsigned i = 0; i < interpolated_x.size(); ++i){
-	// Get index of points before and after x
-	double y1_i = getNextSmallerIndex(x, interpolated_x.at(i));
-	double y2_i = getNextBiggerIndex(x, interpolated_x.at(i));
-	int y_diff = y2_i - y1_i;
-
-	if(y_diff == 1){
-	    // First point p1=(x1/y1) before x
-	    double x1 = x_min + y1_i;
-	    double y1 = y.at(y1_i);
-
-	    // Second point p2=(x2/y2) after x
-	    double x2 = x_min + y2_i;
-	    double y2 = y.at(y2_i);
-	    assert(y.size() >= y1_i);
 
-	    // linear function between p1 and p2 (y=mx+b)
-	    double m = (y2 - y1) / (x2 / x1);
-	    double b = y1 - (m * x1);
-
-	    // Interpolate y from linear function
-	    interpolated_y.at(i) = m * interpolated_x.at(i) + b;
+    #pragma omp parallel for
+    for(unsigned i = 0; i < nInterpolations; ++i){
+      // Get index of points before and after x
+      double y1_i = getNextSmallerIndex(x, interpolated_x.at(i));
+      double y2_i = getNextBiggerIndex(x, interpolated_x.at(i));
+      int y_diff = y2_i - y1_i;
+
+      if(y_diff == 1){
+        // First point p1=(x1/y1) before x
+        double x1 = x_min + y1_i;
+        double y1 = y.at(y1_i);
+
+        // Second point p2=(x2/y2) after x
+        double x2 = x_min + y2_i;
+        double y2 = y.at(y2_i);
+        assert(y.size() >= y1_i);
+
+        // linear function between p1 and p2 (y=mx+b)
+        double m = (y2 - y1) / (x2 / x1);
+        double b = y1 - (m * x1);
+
+        // Interpolate y from linear function
+        interpolated_y.at(i) = m * interpolated_x.at(i) + b;
+
+      }
+      else if(y_diff == 2){
+        // No interpolation needed
+        interpolated_y.at(i) = y.at(y1_i + 1);
+      }
+      else {
+        dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
+        exit(1);
+      }
 
-	}
-	else if(y_diff == 2){
-	    // No interpolation needed
-	    interpolated_y.at(i) = y.at(y1_i + 1);
-	}
-	else {
-	    dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
-	    exit(0);
-	}
-    
     }
   
     return interpolated_y;
@@ -184,17 +189,20 @@ std::vector<double> interpolateWavelength(const std::vector<double> sigma_y, con
   }
 
   std::vector<double> y(interpolation_range, 0);
-  const double lambda_range = lambda_stop - lambda_start;
+  const unsigned lambda_range = lambda_stop - lambda_start;
   assert(sigma_y.size() >= lambda_range);
 
   // Generate sigma_x
-  std::vector<double> sigma_x;
-  for(unsigned i = lambda_start; i <= lambda_stop; ++i){
-    sigma_x.push_back(i);
+  std::vector<double> sigma_x(lambda_range, 0);
+
+  #pragma omp parallel for
+  for(unsigned i = 0; i < lambda_range; ++i){
+    sigma_x[i] = i+lambda_start;
   }
   
+  #pragma omp parallel for
   for(unsigned i = 0; i < interpolation_range; ++i){
-    double x = lambda_start + (i * (lambda_range / interpolation_range));
+    double x = lambda_start + (i * (static_cast<double>(lambda_range) / interpolation_range));
 
     // Get index of points before and after x
     double y1_i = getNextSmallerIndex(sigma_x, x);
@@ -225,7 +233,7 @@ std::vector<double> interpolateWavelength(const std::vector<double> sigma_y, con
     }
     else {
       dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl;
-      exit(0);
+      exit(1);
     }
     
   }
diff --git a/src/parser.cu b/src/parser.cc
similarity index 99%
rename from src/parser.cu
rename to src/parser.cc
index 0a1db2f..441f9da 100644
--- a/src/parser.cu
+++ b/src/parser.cc
@@ -24,6 +24,7 @@
 #include <vector> /* vector */
 #include <assert.h>
 #include <stdlib.h> /* exit() */
+#include <omp.h> /* pragma omp parallel for */
 
 #include <logging.hpp> 
 #include <mesh.hpp>
@@ -331,6 +332,7 @@ Mesh createMesh(const std::vector<unsigned> &triangleIndices,
   // GPU variables
   double totalSurface = 0.;
 
+  #pragma omp parallel for reduction(+:totalSurface)
   for(unsigned i=0;i<numberOfTriangles;++i){
     totalSurface+=double(surfacesVector.at(i));	
   }
@@ -341,6 +343,8 @@ Mesh createMesh(const std::vector<unsigned> &triangleIndices,
   std::vector<double> hostCenters(xOfTriangleCenter.begin(), xOfTriangleCenter.end());
   hostCenters.insert(hostCenters.end(),yOfTriangleCenter.begin(),yOfTriangleCenter.end());
   std::vector<float> totalReflectionAngles(refractiveIndices.size()/2,0);
+
+  #pragma omp parallel for
   for(unsigned i=0;i<refractiveIndices.size();i+=2){
     totalReflectionAngles.at(i/2) = (180. / M_PI *  asin(refractiveIndices.at(i+1) / refractiveIndices.at(i)));
   }
diff --git a/src/write_to_vtk.cc b/src/write_to_vtk.cc
index 10d49ee..fafc950 100644
--- a/src/write_to_vtk.cc
+++ b/src/write_to_vtk.cc
@@ -24,6 +24,7 @@
 #include <iomanip> /* std::fixed, std::setprecision() */
 #include <string> /* std::to_string() */
 #include <time.h> /* time, time_t */
+#include <omp.h> /* pragma omp parallel for */
 
 #include <boost/filesystem.hpp> /* fs::path */
 #include <boost/filesystem/fstream.hpp> /* fs::ofstream, fs::ifstream */
@@ -195,6 +196,7 @@ std::vector<double> compareVtk(std::vector<double> compare, const fs::path filen
   }
   dout(V_INFO) << "Compare solution with " << filename << std::endl;
 
+  #pragma omp parallel for reduction (+:aseTotal)
   for(unsigned i = 0; i < compare.size(); ++i){
     aseTotal += compare.at(i);
   }