4
\$\begingroup\$

Purpose of this program is to calculate the velocity of ng = nxnynz particles at the Lagrangian points from the values at regular grid nodes. How can i speed up this calculation? I hope to receive suggestions on this following code:

#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath> 
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h> 
#include <algorithm> 
using namespace std;
void functionM4( vector<double> &b, vector<double> &w) 
// this is an interpolation function
{
 w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
 w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
 w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
 w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;
}
int main()
{
 double pi = 3.141592653589793;
 int nx0 = 400; // number of grid in X direction
 int ny0 = 400; // number of grid in Y direction
 int nz0 = 400; // number of grid in Y direction
 int nx = nx0+10; 
 int ny = ny0+10; 
 int nz = nz0+10; 
 int ng = nx*ny*nz;
 double xmin = 0.0; // set up domain 
 double xmax = 2.0*pi; 
 double ymin = 0.0; 
 double ymax = 2.0*pi; 
 double zmin = 0.0; 
 double zmax = 2.0*pi; 
 double dx = (xmax-xmin)/(nx0); //spacing step
 double dy = (ymax-ymin)/(ny0); 
 double dz = (zmax-zmin)/(nz0); 
 
 vector<double> x(nx,0.0); 
 vector<double> y(ny,0.0); 
 vector<double> z(nz,0.0); 
 for (int i=0;i<nx;i++)
 x[i] = xmin+(double)(i-5)*dx; //building grid for x direction
 
 for (int j=0;j<ny;j++)
 y[j] = ymin+(double)(j-5)*dy;
 for (int k=0;k<nz;k++)
 z[k] = zmin+(double)(k-5)*dz;
 // ug is velocity component u on grid at the beginning
 vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
// particles at Lagrangian points
 vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
//the particle velocity at Lagrangian locations
 vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 
 int i,j,k;
 #pragma omp parallel for private (j,k) schedule(dynamic)
 for (i=0;i<nx;i++)
 for (j=0;j<ny;j++)
 for (k=0;k<nz;k++)
 {
 //initial values of velocity components at grid nodes
 ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]); 
 vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
 wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
 }
 double dt = 0.001; // time step
//the particle velocity upStar at Lagrangian location xpStar is 
//obtained from the velocity ug at the grid nodes (x,y,z) 
//using a kernel interpolation function.
 cout<<" start "<<endl;
 #pragma omp parallel for private (j,k) schedule(dynamic)
 for (i=0;i<nx;i++) 
 for (j=0;j<ny;j++) 
 for (k=0;k<nz;k++){
 xpStar[i][j][k]= x[i] + dt*ug[i][j][k]; 
 ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
 zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
 }
 #pragma omp parallel for private (j,k) schedule(dynamic) 
 for (i=2;i<nx-2;i++) 
 for (j=2;j<ny-2;j++) 
 for (k=2;k<nz-2;k++){
 int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
 int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
 int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);
 int west = Xindex-1; int est = Xindex+2;
 int south = Yindex-1; int north = Yindex+2; 
 int front = Zindex-1; int back = Zindex+2; 
 vector<double> BX(4,0.0);
 vector<double> BY(4,0.0);
 vector<double> BZ(4,0.0);
 vector<double> wx(4,0.0);
 vector<double> wy(4,0.0);
 vector<double> wz(4,0.0);
 for (int m=west;m<=est;m++){
 BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
 
 for (int n=south;n<=north;n++){
 BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}
 for (int q=front;q<=back;q++){
 BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}
 functionM4(BX,wx);
 functionM4(BY,wy);
 functionM4(BZ,wz);
 for (int m=west;m<=est;m++) 
 for (int n=south;n<=north;n++)
 for (int q=front;q<=back;q++){ 
 
 double w=wx[m-west]*wy[n-south]*wz[q-front];
 upStar[i][j][k] += ug[m][n][q]*w;
 vpStar[i][j][k] += vg[m][n][q]*w;
 wpStar[i][j][k] += wg[m][n][q]*w;
 }
 }
 cout<<" finish ! "<<endl;
 //-------------------------------------Checking results------------------------------------------//
 vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
 vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
 #pragma omp parallel for
 for (int i=3;i<nx-3;i++)
 for (int j=3;j<ny-3;j++)
 for (int k=3;k<nz-3;k++)
 {
 exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
 error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
 }
 double aa = 0.0;
 double bb = 0.0;
 double cc = 0.0;
 double dd = 0.0;
 double maxi = 0.0;
 for (int i=3;i<nx-3;i++)
 for (int j=3;j<ny-3;j++)
 for (int k=3;k<nz-3;k++)
 {
 aa += abs(exact[i][j][k]);
 bb += error[i][j][k];
 cc += error[i][j][k]*error[i][j][k];
 dd += exact[i][j][k]*exact[i][j][k];
 if (abs(error[i][j][k])>maxi)
 maxi = abs(error[i][j][k]);
 }
 cout<<" L1_norm = "<<bb/aa<<endl;
 cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
 cout<<" L3_norm = "<<maxi<<endl;
 return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}
asked Oct 21, 2020 at 3:58
\$\endgroup\$
4
  • 1
    \$\begingroup\$ you should use Horner's rule for computing the elements of w, which should save a few multiplications. \$\endgroup\$ Commented Oct 21, 2020 at 4:19
  • \$\begingroup\$ Using Horner's rule rule, it can save 1 or 2 multiplications. Thank you very much @xdavidliu \$\endgroup\$ Commented Oct 21, 2020 at 4:25
  • \$\begingroup\$ I am wrong. It can save 2 or 3 multiplications. \$\endgroup\$ Commented Oct 21, 2020 at 5:26
  • 1
    \$\begingroup\$ might be useful stackoverflow.com/questions/375913/… \$\endgroup\$ Commented Oct 21, 2020 at 11:05

1 Answer 1

3
\$\begingroup\$

(Note: I don't know anything about OMP, so I haven't commented on it!)


double pi = 3.141592653589793;
int nx0 = 400; // number of grid in X direction
...

Constants should be declared const (it helps the programmer, if not the compiler).


for (i=0;i<nx;i++)
 for (j=0;j<ny;j++)
 for (k=0;k<nz;k++)
 {
 ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]); 
 //initial values of velocity components at grid nodes
 vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
 wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
 }

I know this is just the setup phase, but that's a lot of repeated cos and sin calculations for the same values. We could move the calculations into the outer loops, or (better) we could pre-calculate sin(x[n]), cos(x[n]), sin(y[n]), etc. separately.

Iterating over all 3 velocity grids at once is likely to be slower than 3 separate loops to fill each one independently. (Due to cache misses).


vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 

This is quite complicated in terms of memory allocation so it's slow to create (lots of copying of those one and two dimensional vectors) and to iterate (allocated memory is unlikely to be contiguous, so we don't know how to access elements in a cache-efficient manner).

Instead, try allocating a single vector, and calculating the indices from 3d coordinates, something like:

template<class T>
struct array_3d
{
 using index_t = typename std::vector<T>::size_type;
 array_3d(index_t width, index_t height, index_t depth):
 m_width(width), m_height(height), m_depth(depth),
 m_data(m_width * m_height * m_depth) { }
 T& at(index_t x, index_t y, index_t z)
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 
 T const& at(index_t x, index_t y, index_t z) const
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 index_t m_width, m_height, m_depth;
 std::vector<T> m_data;
};

If we're careful in how we iterate this data structure, we can avoid skipping around in memory, e.g.:

for (int k=0;k<nz;k++)
 for (int j=0;j<ny;j++)
 for (int i=0;i<nx;i++)
 xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

Every access of xpStar.m_data alters the next element in the array. (So we could even avoid calculating the index from the coordinates and just increment an iterator, which may or may not be faster).

(Note: The way I implemented it above, has the x component innermost (so x+1 is index+1). This is the opposite of your initial code (which has z innermost), but it doesn't really matter which you choose, as long as you loop through the dimensions in the right order when iterating).


 vector<double> BX(4,0.0);
 vector<double> BY(4,0.0);
 vector<double> BZ(4,0.0);
 vector<double> wx(4,0.0);
 vector<double> wy(4,0.0);
 vector<double> wz(4,0.0);

Since these are a fixed size, use std::array<double, 4> instead!


 for (int m=west;m<=est;m++) 
 for (int n=south;n<=north;n++)
 for (int q=front;q<=back;q++){ 
 
 double w=wx[m-west]*wy[n-south]*wz[q-front];
 upStar[i][j][k] += ug[m][n][q]*w;
 vpStar[i][j][k] += vg[m][n][q]*w;
 wpStar[i][j][k] += wg[m][n][q]*w;
 }

Again, it might be faster to split this into separate loops for upStar, vpStar and wpStar (and then precalculate w), e.g.:

 array<double, 4*4*4> w;
 
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
answered Oct 21, 2020 at 11:41
\$\endgroup\$
3
  • \$\begingroup\$ Thank you very much for introducing template<class T> struct array_3d replacing 3-D vectors. It is very nice. I tried it, and it works well. Could you explain 'T& at(index_t x, index_t y, index_t z)' and 'T& at(index_t x, index_t y, index_t z) const'? @user673679 \$\endgroup\$ Commented Oct 22, 2020 at 0:29
  • \$\begingroup\$ I tested the code with separation of loops for upStar, vpStar and wpStar, the calculation speed is not better. However, the way code written is very clear, and helpful. \$\endgroup\$ Commented Oct 22, 2020 at 0:46
  • 1
    \$\begingroup\$ Ah yep, that second at() function should return a const&, my bad. \$\endgroup\$ Commented Oct 22, 2020 at 15:22

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.