Kernel Interpolation C++

Question 1

Purpose of this program is to calculate the velocity of ng = nxnynz particles at the Lagrangian points from the values at regular grid nodes. How can i speed up this calculation? I hope to receive suggestions on this following code:

#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath> 
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h> 
#include <algorithm> 
using namespace std;
void functionM4( vector<double> &b, vector<double> &w) 
// this is an interpolation function
{
 w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
 w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
 w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
 w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;
}
int main()
{
 double pi = 3.141592653589793;
 int nx0 = 400; // number of grid in X direction
 int ny0 = 400; // number of grid in Y direction
 int nz0 = 400; // number of grid in Y direction
 int nx = nx0+10; 
 int ny = ny0+10; 
 int nz = nz0+10; 
 int ng = nx*ny*nz;
 double xmin = 0.0; // set up domain 
 double xmax = 2.0*pi; 
 double ymin = 0.0; 
 double ymax = 2.0*pi; 
 double zmin = 0.0; 
 double zmax = 2.0*pi; 
 double dx = (xmax-xmin)/(nx0); //spacing step
 double dy = (ymax-ymin)/(ny0); 
 double dz = (zmax-zmin)/(nz0); 
 
 vector<double> x(nx,0.0); 
 vector<double> y(ny,0.0); 
 vector<double> z(nz,0.0); 
 for (int i=0;i<nx;i++)
 x[i] = xmin+(double)(i-5)*dx; //building grid for x direction
 
 for (int j=0;j<ny;j++)
 y[j] = ymin+(double)(j-5)*dy;
 for (int k=0;k<nz;k++)
 z[k] = zmin+(double)(k-5)*dz;
 // ug is velocity component u on grid at the beginning
 vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
// particles at Lagrangian points
 vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
//the particle velocity at Lagrangian locations
 vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
 
 int i,j,k;
 #pragma omp parallel for private (j,k) schedule(dynamic)
 for (i=0;i<nx;i++)
 for (j=0;j<ny;j++)
 for (k=0;k<nz;k++)
 {
 //initial values of velocity components at grid nodes
 ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]); 
 vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
 wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
 }
 double dt = 0.001; // time step
//the particle velocity upStar at Lagrangian location xpStar is 
//obtained from the velocity ug at the grid nodes (x,y,z) 
//using a kernel interpolation function.
 cout<<" start "<<endl;
 #pragma omp parallel for private (j,k) schedule(dynamic)
 for (i=0;i<nx;i++) 
 for (j=0;j<ny;j++) 
 for (k=0;k<nz;k++){
 xpStar[i][j][k]= x[i] + dt*ug[i][j][k]; 
 ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
 zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
 }
 #pragma omp parallel for private (j,k) schedule(dynamic) 
 for (i=2;i<nx-2;i++) 
 for (j=2;j<ny-2;j++) 
 for (k=2;k<nz-2;k++){
 int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
 int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
 int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);
 int west = Xindex-1; int est = Xindex+2;
 int south = Yindex-1; int north = Yindex+2; 
 int front = Zindex-1; int back = Zindex+2; 
 vector<double> BX(4,0.0);
 vector<double> BY(4,0.0);
 vector<double> BZ(4,0.0);
 vector<double> wx(4,0.0);
 vector<double> wy(4,0.0);
 vector<double> wz(4,0.0);
 for (int m=west;m<=est;m++){
 BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
 
 for (int n=south;n<=north;n++){
 BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}
 for (int q=front;q<=back;q++){
 BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}
 functionM4(BX,wx);
 functionM4(BY,wy);
 functionM4(BZ,wz);
 for (int m=west;m<=est;m++) 
 for (int n=south;n<=north;n++)
 for (int q=front;q<=back;q++){ 
 
 double w=wx[m-west]*wy[n-south]*wz[q-front];
 upStar[i][j][k] += ug[m][n][q]*w;
 vpStar[i][j][k] += vg[m][n][q]*w;
 wpStar[i][j][k] += wg[m][n][q]*w;
 }
 }
 cout<<" finish ! "<<endl;
 //-------------------------------------Checking results------------------------------------------//
 vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
 vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
 #pragma omp parallel for
 for (int i=3;i<nx-3;i++)
 for (int j=3;j<ny-3;j++)
 for (int k=3;k<nz-3;k++)
 {
 exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
 error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
 }
 double aa = 0.0;
 double bb = 0.0;
 double cc = 0.0;
 double dd = 0.0;
 double maxi = 0.0;
 for (int i=3;i<nx-3;i++)
 for (int j=3;j<ny-3;j++)
 for (int k=3;k<nz-3;k++)
 {
 aa += abs(exact[i][j][k]);
 bb += error[i][j][k];
 cc += error[i][j][k]*error[i][j][k];
 dd += exact[i][j][k]*exact[i][j][k];
 if (abs(error[i][j][k])>maxi)
 maxi = abs(error[i][j][k]);
 }
 cout<<" L1_norm = "<<bb/aa<<endl;
 cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
 cout<<" L3_norm = "<<maxi<<endl;
 return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}

Question 2

you should use Horner's rule for computing the elements of w, which should save a few multiplications.

Question 3

Using Horner's rule rule, it can save 1 or 2 multiplications. Thank you very much @xdavidliu

Question 4

I am wrong. It can save 2 or 3 multiplications.

Question 5

might be useful stackoverflow.com/questions/375913/…

Question 6

(Note: I don't know anything about OMP, so I haven't commented on it!)

double pi = 3.141592653589793;
int nx0 = 400; // number of grid in X direction
...

Constants should be declared const (it helps the programmer, if not the compiler).

for (i=0;i<nx;i++)
 for (j=0;j<ny;j++)
 for (k=0;k<nz;k++)
 {
 ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]); 
 //initial values of velocity components at grid nodes
 vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
 wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
 }

I know this is just the setup phase, but that's a lot of repeated cos and sin calculations for the same values. We could move the calculations into the outer loops, or (better) we could pre-calculate sin(x[n]), cos(x[n]), sin(y[n]), etc. separately.

Iterating over all 3 velocity grids at once is likely to be slower than 3 separate loops to fill each one independently. (Due to cache misses).

vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

This is quite complicated in terms of memory allocation so it's slow to create (lots of copying of those one and two dimensional vectors) and to iterate (allocated memory is unlikely to be contiguous, so we don't know how to access elements in a cache-efficient manner).

Instead, try allocating a single vector, and calculating the indices from 3d coordinates, something like:

template<class T>
struct array_3d
{
 using index_t = typename std::vector<T>::size_type;
 array_3d(index_t width, index_t height, index_t depth):
 m_width(width), m_height(height), m_depth(depth),
 m_data(m_width * m_height * m_depth) { }
 T& at(index_t x, index_t y, index_t z)
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 
 T const& at(index_t x, index_t y, index_t z) const
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 index_t m_width, m_height, m_depth;
 std::vector<T> m_data;
};

If we're careful in how we iterate this data structure, we can avoid skipping around in memory, e.g.:

for (int k=0;k<nz;k++)
 for (int j=0;j<ny;j++)
 for (int i=0;i<nx;i++)
 xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

Every access of xpStar.m_data alters the next element in the array. (So we could even avoid calculating the index from the coordinates and just increment an iterator, which may or may not be faster).

(Note: The way I implemented it above, has the x component innermost (so x+1 is index+1). This is the opposite of your initial code (which has z innermost), but it doesn't really matter which you choose, as long as you loop through the dimensions in the right order when iterating).

 vector<double> BX(4,0.0);
 vector<double> BY(4,0.0);
 vector<double> BZ(4,0.0);
 vector<double> wx(4,0.0);
 vector<double> wy(4,0.0);
 vector<double> wz(4,0.0);

Since these are a fixed size, use std::array<double, 4> instead!

 for (int m=west;m<=est;m++) 
 for (int n=south;n<=north;n++)
 for (int q=front;q<=back;q++){ 
 
 double w=wx[m-west]*wy[n-south]*wz[q-front];
 upStar[i][j][k] += ug[m][n][q]*w;
 vpStar[i][j][k] += vg[m][n][q]*w;
 wpStar[i][j][k] += wg[m][n][q]*w;
 }

Again, it might be faster to split this into separate loops for upStar, vpStar and wpStar (and then precalculate w), e.g.:

 array<double, 4*4*4> w;
 
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

Question 7

Thank you very much for introducing template<class T> struct array_3d replacing 3-D vectors. It is very nice. I tried it, and it works well. Could you explain 'T& at(index_t x, index_t y, index_t z)' and 'T& at(index_t x, index_t y, index_t z) const'? @user673679

Question 8

I tested the code with separation of loops for upStar, vpStar and wpStar, the calculation speed is not better. However, the way code written is very clear, and helpful.

Question 9

Ah yep, that second at() function should return a const&, my bad.

user673679 user673679 12.2k2 gold badges34 silver badges65 bronze badges · Answer 1 · 2020-10-21 11:41:55Z

(Note: I don't know anything about OMP, so I haven't commented on it!)

double pi = 3.141592653589793;
int nx0 = 400; // number of grid in X direction
...

Constants should be declared const (it helps the programmer, if not the compiler).

for (i=0;i<nx;i++)
 for (j=0;j<ny;j++)
 for (k=0;k<nz;k++)
 {
 ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]); 
 //initial values of velocity components at grid nodes
 vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
 wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
 }

I know this is just the setup phase, but that's a lot of repeated cos and sin calculations for the same values. We could move the calculations into the outer loops, or (better) we could pre-calculate sin(x[n]), cos(x[n]), sin(y[n]), etc. separately.

Iterating over all 3 velocity grids at once is likely to be slower than 3 separate loops to fill each one independently. (Due to cache misses).

vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

This is quite complicated in terms of memory allocation so it's slow to create (lots of copying of those one and two dimensional vectors) and to iterate (allocated memory is unlikely to be contiguous, so we don't know how to access elements in a cache-efficient manner).

Instead, try allocating a single vector, and calculating the indices from 3d coordinates, something like:

template<class T>
struct array_3d
{
 using index_t = typename std::vector<T>::size_type;
 array_3d(index_t width, index_t height, index_t depth):
 m_width(width), m_height(height), m_depth(depth),
 m_data(m_width * m_height * m_depth) { }
 T& at(index_t x, index_t y, index_t z)
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 
 T const& at(index_t x, index_t y, index_t z) const
 {
 return m_data[x + y * m_width + z * m_width * m_height];
 }
 index_t m_width, m_height, m_depth;
 std::vector<T> m_data;
};

If we're careful in how we iterate this data structure, we can avoid skipping around in memory, e.g.:

for (int k=0;k<nz;k++)
 for (int j=0;j<ny;j++)
 for (int i=0;i<nx;i++)
 xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

Every access of xpStar.m_data alters the next element in the array. (So we could even avoid calculating the index from the coordinates and just increment an iterator, which may or may not be faster).

(Note: The way I implemented it above, has the x component innermost (so x+1 is index+1). This is the opposite of your initial code (which has z innermost), but it doesn't really matter which you choose, as long as you loop through the dimensions in the right order when iterating).

 vector<double> BX(4,0.0);
 vector<double> BY(4,0.0);
 vector<double> BZ(4,0.0);
 vector<double> wx(4,0.0);
 vector<double> wy(4,0.0);
 vector<double> wz(4,0.0);

Since these are a fixed size, use std::array<double, 4> instead!

 for (int m=west;m<=est;m++) 
 for (int n=south;n<=north;n++)
 for (int q=front;q<=back;q++){ 
 
 double w=wx[m-west]*wy[n-south]*wz[q-front];
 upStar[i][j][k] += ug[m][n][q]*w;
 vpStar[i][j][k] += vg[m][n][q]*w;
 wpStar[i][j][k] += wg[m][n][q]*w;
 }

Again, it might be faster to split this into separate loops for upStar, vpStar and wpStar (and then precalculate w), e.g.:

 array<double, 4*4*4> w;
 
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
 for (int q=0;q!=4;q++)
 for (int n=0;n!=4;n++)
 for (int m=0;m!=4;m++) 
 wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

Thank you very much for introducing template<class T> struct array_3d replacing 3-D vectors. It is very nice. I tried it, and it works well. Could you explain 'T& at(index_t x, index_t y, index_t z)' and 'T& at(index_t x, index_t y, index_t z) const'? @user673679
I tested the code with separation of loops for upStar, vpStar and wpStar, the calculation speed is not better. However, the way code written is very clear, and helpful.
Ah yep, that second at() function should return a const&, my bad.

Stack Exchange Network

Kernel Interpolation C++

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Kernel Interpolation C++

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions