From 91625acadf2c623667acb5b404a6138d12a7ff65 Mon Sep 17 00:00:00 2001 From: Carlchristian Eckert Date: Tue, 9 Jun 2015 01:40:57 +0200 Subject: [PATCH] added OpenMP support for host-side loops see #51 Some observations: - we have only trivial (and fast) loops - the other loops are integral steps of the simulation and can not be parallelized (sequential steps and maybe with device-code) - one of the parsing loops uses cudaSetDevice, not sure if it's possible to parallelize that in a good way. - parallelizing std::vector is ok as long as the length is fixed (no reallocation). That means, we may not use vector.push_back() or vector.insert() inside a loop with OpenMP pragmas. Compiler might not complain... Only "easy" loops were parallelized, only basic pragmas were used. Might give some speedup one day, but if not... no problem. Pragmas and code changes are non-intrusive enough to keep it maintainable. --- CMakeLists.txt | 1 + src/{interpolation.cu => interpolation.cc} | 86 ++++++++++++---------- src/{parser.cu => parser.cc} | 4 + src/write_to_vtk.cc | 2 + 4 files changed, 54 insertions(+), 39 deletions(-) rename src/{interpolation.cu => interpolation.cc} (77%) rename src/{parser.cu => parser.cc} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f66f553..87e704c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,7 @@ set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) ################################################################################ # GNU if(CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") diff --git a/src/interpolation.cu b/src/interpolation.cc similarity index 77% rename from src/interpolation.cu rename to src/interpolation.cc index e169126..c213f50 100644 --- a/src/interpolation.cu +++ b/src/interpolation.cc @@ -21,6 +21,7 @@ #include #include +#include /* pragma omp parallel for */ #include #include @@ -102,7 +103,7 @@ std::vector interpolateLinear(const std::vector y, const std::ve // Monochromatic case if(y.size() == 1){ - return std::vector(nInterpolations, y.at(0)); + return std::vector(nInterpolations, y.at(0)); } // Check x data if monoton increasing @@ -110,45 +111,49 @@ std::vector interpolateLinear(const std::vector y, const std::ve // Create equidistant interpolation on x axis std::vector interpolated_x(nInterpolations, 0); + + #pragma omp parallel for for(unsigned i = 0; i < nInterpolations; ++i){ - interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations)); + interpolated_x.at(i) = x_min + (i * (x_range / nInterpolations)); } // Start to interpolate y values for every x value std::vector interpolated_y(nInterpolations, 0); - for(unsigned i = 0; i < interpolated_x.size(); ++i){ - // Get index of points before and after x - double y1_i = getNextSmallerIndex(x, interpolated_x.at(i)); - double y2_i = getNextBiggerIndex(x, interpolated_x.at(i)); - int y_diff = y2_i - y1_i; - - if(y_diff == 1){ - // First point p1=(x1/y1) before x - double x1 = x_min + y1_i; - double y1 = y.at(y1_i); - - // Second point p2=(x2/y2) after x - double x2 = x_min + y2_i; - double y2 = y.at(y2_i); - assert(y.size() >= y1_i); - // linear function between p1 and p2 (y=mx+b) - double m = (y2 - y1) / (x2 / x1); - double b = y1 - (m * x1); - - // Interpolate y from linear function - interpolated_y.at(i) = m * interpolated_x.at(i) + b; + #pragma omp parallel for + for(unsigned i = 0; i < nInterpolations; ++i){ + // Get index of points before and after x + double y1_i = getNextSmallerIndex(x, interpolated_x.at(i)); + double y2_i = getNextBiggerIndex(x, interpolated_x.at(i)); + int y_diff = y2_i - y1_i; + + if(y_diff == 1){ + // First point p1=(x1/y1) before x + double x1 = x_min + y1_i; + double y1 = y.at(y1_i); + + // Second point p2=(x2/y2) after x + double x2 = x_min + y2_i; + double y2 = y.at(y2_i); + assert(y.size() >= y1_i); + + // linear function between p1 and p2 (y=mx+b) + double m = (y2 - y1) / (x2 / x1); + double b = y1 - (m * x1); + + // Interpolate y from linear function + interpolated_y.at(i) = m * interpolated_x.at(i) + b; + + } + else if(y_diff == 2){ + // No interpolation needed + interpolated_y.at(i) = y.at(y1_i + 1); + } + else { + dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl; + exit(1); + } - } - else if(y_diff == 2){ - // No interpolation needed - interpolated_y.at(i) = y.at(y1_i + 1); - } - else { - dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl; - exit(0); - } - } return interpolated_y; @@ -184,17 +189,20 @@ std::vector interpolateWavelength(const std::vector sigma_y, con } std::vector y(interpolation_range, 0); - const double lambda_range = lambda_stop - lambda_start; + const unsigned lambda_range = lambda_stop - lambda_start; assert(sigma_y.size() >= lambda_range); // Generate sigma_x - std::vector sigma_x; - for(unsigned i = lambda_start; i <= lambda_stop; ++i){ - sigma_x.push_back(i); + std::vector sigma_x(lambda_range, 0); + + #pragma omp parallel for + for(unsigned i = 0; i < lambda_range; ++i){ + sigma_x[i] = i+lambda_start; } + #pragma omp parallel for for(unsigned i = 0; i < interpolation_range; ++i){ - double x = lambda_start + (i * (lambda_range / interpolation_range)); + double x = lambda_start + (i * (static_cast(lambda_range) / interpolation_range)); // Get index of points before and after x double y1_i = getNextSmallerIndex(sigma_x, x); @@ -225,7 +233,7 @@ std::vector interpolateWavelength(const std::vector sigma_y, con } else { dout(V_ERROR) << "Index of smaller and bigger sigma too seperated" << std::endl; - exit(0); + exit(1); } } diff --git a/src/parser.cu b/src/parser.cc similarity index 99% rename from src/parser.cu rename to src/parser.cc index 0a1db2f..441f9da 100644 --- a/src/parser.cu +++ b/src/parser.cc @@ -24,6 +24,7 @@ #include /* vector */ #include #include /* exit() */ +#include /* pragma omp parallel for */ #include #include @@ -331,6 +332,7 @@ Mesh createMesh(const std::vector &triangleIndices, // GPU variables double totalSurface = 0.; + #pragma omp parallel for reduction(+:totalSurface) for(unsigned i=0;i &triangleIndices, std::vector hostCenters(xOfTriangleCenter.begin(), xOfTriangleCenter.end()); hostCenters.insert(hostCenters.end(),yOfTriangleCenter.begin(),yOfTriangleCenter.end()); std::vector totalReflectionAngles(refractiveIndices.size()/2,0); + + #pragma omp parallel for for(unsigned i=0;i /* std::fixed, std::setprecision() */ #include /* std::to_string() */ #include /* time, time_t */ +#include /* pragma omp parallel for */ #include /* fs::path */ #include /* fs::ofstream, fs::ifstream */ @@ -195,6 +196,7 @@ std::vector compareVtk(std::vector compare, const fs::path filen } dout(V_INFO) << "Compare solution with " << filename << std::endl; + #pragma omp parallel for reduction (+:aseTotal) for(unsigned i = 0; i < compare.size(); ++i){ aseTotal += compare.at(i); }