Purpose of this program is to calculate the velocity of ng = nxnynz particles at the Lagrangian points from the values at regular grid nodes. How can i speed up this calculation? I hope to receive suggestions on this following code:
#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath>
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h>
#include <algorithm>
using namespace std;
void functionM4( vector<double> &b, vector<double> &w)
// this is an interpolation function
{
w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;
}
int main()
{
double pi = 3.141592653589793;
int nx0 = 400; // number of grid in X direction
int ny0 = 400; // number of grid in Y direction
int nz0 = 400; // number of grid in Y direction
int nx = nx0+10;
int ny = ny0+10;
int nz = nz0+10;
int ng = nx*ny*nz;
double xmin = 0.0; // set up domain
double xmax = 2.0*pi;
double ymin = 0.0;
double ymax = 2.0*pi;
double zmin = 0.0;
double zmax = 2.0*pi;
double dx = (xmax-xmin)/(nx0); //spacing step
double dy = (ymax-ymin)/(ny0);
double dz = (zmax-zmin)/(nz0);
vector<double> x(nx,0.0);
vector<double> y(ny,0.0);
vector<double> z(nz,0.0);
for (int i=0;i<nx;i++)
x[i] = xmin+(double)(i-5)*dx; //building grid for x direction
for (int j=0;j<ny;j++)
y[j] = ymin+(double)(j-5)*dy;
for (int k=0;k<nz;k++)
z[k] = zmin+(double)(k-5)*dz;
// ug is velocity component u on grid at the beginning
vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
// particles at Lagrangian points
vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
//the particle velocity at Lagrangian locations
vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
int i,j,k;
#pragma omp parallel for private (j,k) schedule(dynamic)
for (i=0;i<nx;i++)
for (j=0;j<ny;j++)
for (k=0;k<nz;k++)
{
//initial values of velocity components at grid nodes
ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);
vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
}
double dt = 0.001; // time step
//the particle velocity upStar at Lagrangian location xpStar is
//obtained from the velocity ug at the grid nodes (x,y,z)
//using a kernel interpolation function.
cout<<" start "<<endl;
#pragma omp parallel for private (j,k) schedule(dynamic)
for (i=0;i<nx;i++)
for (j=0;j<ny;j++)
for (k=0;k<nz;k++){
xpStar[i][j][k]= x[i] + dt*ug[i][j][k];
ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
}
#pragma omp parallel for private (j,k) schedule(dynamic)
for (i=2;i<nx-2;i++)
for (j=2;j<ny-2;j++)
for (k=2;k<nz-2;k++){
int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);
int west = Xindex-1; int est = Xindex+2;
int south = Yindex-1; int north = Yindex+2;
int front = Zindex-1; int back = Zindex+2;
vector<double> BX(4,0.0);
vector<double> BY(4,0.0);
vector<double> BZ(4,0.0);
vector<double> wx(4,0.0);
vector<double> wy(4,0.0);
vector<double> wz(4,0.0);
for (int m=west;m<=est;m++){
BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
for (int n=south;n<=north;n++){
BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}
for (int q=front;q<=back;q++){
BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}
functionM4(BX,wx);
functionM4(BY,wy);
functionM4(BZ,wz);
for (int m=west;m<=est;m++)
for (int n=south;n<=north;n++)
for (int q=front;q<=back;q++){
double w=wx[m-west]*wy[n-south]*wz[q-front];
upStar[i][j][k] += ug[m][n][q]*w;
vpStar[i][j][k] += vg[m][n][q]*w;
wpStar[i][j][k] += wg[m][n][q]*w;
}
}
cout<<" finish ! "<<endl;
//-------------------------------------Checking results------------------------------------------//
vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
#pragma omp parallel for
for (int i=3;i<nx-3;i++)
for (int j=3;j<ny-3;j++)
for (int k=3;k<nz-3;k++)
{
exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
}
double aa = 0.0;
double bb = 0.0;
double cc = 0.0;
double dd = 0.0;
double maxi = 0.0;
for (int i=3;i<nx-3;i++)
for (int j=3;j<ny-3;j++)
for (int k=3;k<nz-3;k++)
{
aa += abs(exact[i][j][k]);
bb += error[i][j][k];
cc += error[i][j][k]*error[i][j][k];
dd += exact[i][j][k]*exact[i][j][k];
if (abs(error[i][j][k])>maxi)
maxi = abs(error[i][j][k]);
}
cout<<" L1_norm = "<<bb/aa<<endl;
cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
cout<<" L3_norm = "<<maxi<<endl;
return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}
1 Answer 1
(Note: I don't know anything about OMP, so I haven't commented on it!)
double pi = 3.141592653589793;
int nx0 = 400; // number of grid in X direction
...
Constants should be declared const
(it helps the programmer, if not the compiler).
for (i=0;i<nx;i++)
for (j=0;j<ny;j++)
for (k=0;k<nz;k++)
{
ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);
//initial values of velocity components at grid nodes
vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
}
I know this is just the setup phase, but that's a lot of repeated cos
and sin
calculations for the same values. We could move the calculations into the outer loops, or (better) we could pre-calculate sin(x[n])
, cos(x[n])
, sin(y[n])
, etc. separately.
Iterating over all 3 velocity grids at once is likely to be slower than 3 separate loops to fill each one independently. (Due to cache misses).
vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
This is quite complicated in terms of memory allocation so it's slow to create (lots of copying of those one and two dimensional vectors) and to iterate (allocated memory is unlikely to be contiguous, so we don't know how to access elements in a cache-efficient manner).
Instead, try allocating a single vector, and calculating the indices from 3d coordinates, something like:
template<class T>
struct array_3d
{
using index_t = typename std::vector<T>::size_type;
array_3d(index_t width, index_t height, index_t depth):
m_width(width), m_height(height), m_depth(depth),
m_data(m_width * m_height * m_depth) { }
T& at(index_t x, index_t y, index_t z)
{
return m_data[x + y * m_width + z * m_width * m_height];
}
T const& at(index_t x, index_t y, index_t z) const
{
return m_data[x + y * m_width + z * m_width * m_height];
}
index_t m_width, m_height, m_depth;
std::vector<T> m_data;
};
If we're careful in how we iterate this data structure, we can avoid skipping around in memory, e.g.:
for (int k=0;k<nz;k++)
for (int j=0;j<ny;j++)
for (int i=0;i<nx;i++)
xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);
Every access of xpStar.m_data
alters the next element in the array. (So we could even avoid calculating the index from the coordinates and just increment an iterator, which may or may not be faster).
(Note: The way I implemented it above, has the x
component innermost (so x+1
is index+1
). This is the opposite of your initial code (which has z
innermost), but it doesn't really matter which you choose, as long as you loop through the dimensions in the right order when iterating).
vector<double> BX(4,0.0);
vector<double> BY(4,0.0);
vector<double> BZ(4,0.0);
vector<double> wx(4,0.0);
vector<double> wy(4,0.0);
vector<double> wz(4,0.0);
Since these are a fixed size, use std::array<double, 4>
instead!
for (int m=west;m<=est;m++)
for (int n=south;n<=north;n++)
for (int q=front;q<=back;q++){
double w=wx[m-west]*wy[n-south]*wz[q-front];
upStar[i][j][k] += ug[m][n][q]*w;
vpStar[i][j][k] += vg[m][n][q]*w;
wpStar[i][j][k] += wg[m][n][q]*w;
}
Again, it might be faster to split this into separate loops for upStar
, vpStar
and wpStar
(and then precalculate w
), e.g.:
array<double, 4*4*4> w;
for (int q=0;q!=4;q++)
for (int n=0;n!=4;n++)
for (int m=0;m!=4;m++)
w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];
for (int q=0;q!=4;q++)
for (int n=0;n!=4;n++)
for (int m=0;m!=4;m++)
upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
for (int q=0;q!=4;q++)
for (int n=0;n!=4;n++)
for (int m=0;m!=4;m++)
vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
for (int q=0;q!=4;q++)
for (int n=0;n!=4;n++)
for (int m=0;m!=4;m++)
wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
-
\$\begingroup\$ Thank you very much for introducing template<class T> struct array_3d replacing 3-D vectors. It is very nice. I tried it, and it works well. Could you explain 'T& at(index_t x, index_t y, index_t z)' and 'T& at(index_t x, index_t y, index_t z) const'? @user673679 \$\endgroup\$Luc Nguyen– Luc Nguyen2020年10月22日 00:29:02 +00:00Commented Oct 22, 2020 at 0:29
-
\$\begingroup\$ I tested the code with separation of loops for
upStar
,vpStar
andwpStar
, the calculation speed is not better. However, the way code written is very clear, and helpful. \$\endgroup\$Luc Nguyen– Luc Nguyen2020年10月22日 00:46:17 +00:00Commented Oct 22, 2020 at 0:46 -
1\$\begingroup\$ Ah yep, that second
at()
function should return aconst&
, my bad. \$\endgroup\$user673679– user6736792020年10月22日 15:22:02 +00:00Commented Oct 22, 2020 at 15:22
w
, which should save a few multiplications. \$\endgroup\$