I have the following function in Matlab:
function [RawDataKK]=DataCompressKKTest(Data,RXangle,s)
tSize=size(Data,1);
xSize=size(Data,2);
TXSize=size(Data,3);
RXSize=numel(RXangle);
RawDataKK=zeros(tSize,TXSize,RXSize);
DataTemp=zeros(tSize,xSize);
slope = s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
for nR=1:RXSize
% indices = mod((0:tSize-1)' + round(nShift(nR,:)), tSize) + 1;
for nT=1:TXSize
for nx=1:xSize
% DataTemp(indices(:,nx)) = Data(:,nx,nT);
DataTemp(:,nx)=circshift(Data(:,nx,nT),nShift(nR,nx));
% DataTemp(1:nShift(nR,nx),nx) = Data( (tSize-nShift(nR,nx)+1):end,nx,nT);
% DataTemp( (nShift(nR,nx)+1):end,nx) = Data( 1:(tSize-nShift(nR,nx)),nx,nT);
end
RawDataKK(:,nT,nR)=sum(DataTemp,2);
end
end
end
I tried implementing it as a C++ function using the Eigen toolbox and the Matlab Data API for C++. In C++, I'm implementing the commented out version which returns identical results and runs at roughly the same speed in Matlab.
#include <iostream>
#include "mex.hpp"
#include "mexAdapter.hpp"
#include <Eigen/Dense>
#include <MatlabDataArray.hpp>
#include <cmath>
#include <math.h>
#include <omp.h>
class MexFunction : public matlab::mex::Function {
public:
// Factory to create MATLAB data arrays
matlab::data::ArrayFactory factory;
void operator()(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
checkArguments(outputs, inputs);
// Initialize input parameters
int tSize = inputs[0][0];
int xSize = inputs[0][1];
int TXSize = inputs[0][2];
int RXSize = inputs[0][3];
int RFlen = inputs[0][4];
auto ptr = getDataPtr<std::complex<double>>(inputs[1]);
Eigen::Map< const Eigen::MatrixXcd > Data( ptr, tSize, TXSize*xSize );
double s = inputs[2][0];
auto ptr2 = getDataPtr<double>(inputs[3]);
Eigen::Map< const Eigen::VectorXd > RXangle( ptr2, RXSize );
outputs[0] = factory.createArray<std::complex<double>>({static_cast<size_t>(tSize),static_cast<size_t>(TXSize*RXSize)});
auto ptrRecon = getOutDataPtr<std::complex<double>>(outputs[0]);
Eigen::Map<Eigen::MatrixXcd> RFDataKK(ptrRecon,tSize,TXSize*RXSize);
// Get num threads
int numThreads = omp_get_max_threads();
int nProc = omp_get_num_procs();
omp_set_num_threads(nProc*2);
Eigen::MatrixXi nShift = ( (s*RXangle.array().sin()/2).replicate(1,xSize).array() *
(Eigen::VectorXd::LinSpaced(xSize,0,xSize-1).transpose().replicate(RXSize,1).array() -
((xSize-1)*((1-RXangle.array().sign())/2)).matrix().replicate(1,xSize).array()) ).round().cast<int>();
Eigen::MatrixXi nShift2 = tSize - nShift.array();
#pragma omp parallel for
for (int nR = 0; nR < RXSize; nR++) {
Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize,TXSize*xSize);
for (int nx = 0; nx < xSize; nx++) {
shiftedData( Eigen::seq(0,nShift(nR,nx)-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1)) = Data( Eigen::seq(nShift2(nR,nx),tSize-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1));
shiftedData( Eigen::seq(nShift(nR,nx),tSize-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1)) = Data( Eigen::seq(0,nShift2(nR,nx)-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1));
}
RFDataKK(Eigen::all,Eigen::seq(nR*TXSize,(nR+1)*TXSize-1) ) = shiftedData.reshaped(tSize*TXSize,xSize).rowwise().sum().reshaped(tSize,TXSize);
}
}
void checkArguments(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
matlab::data::ArrayFactory factory;
if (inputs.size() != 4) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Four inputs required") }));
}
if (inputs[0].getNumberOfElements() != 5) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Need 5 input parameters") }));
}
if (inputs[0].getType() != matlab::data::ArrayType::INT32) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input parameter must be 32 bit integer") }));
}
if (inputs[1].getType() == matlab::data::ArrayType::DOUBLE ||
inputs[1].getType() != matlab::data::ArrayType::COMPLEX_DOUBLE) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input RFData must be type complex double") }));
}
if (inputs[1].getDimensions().size() != 2) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input must be m-by-n dimension") }));
}
// TODO: Implement remaining checks
}
template <typename T>
const T* getDataPtr(matlab::data::Array arr) {
const matlab::data::TypedArray<T> arr_t = arr;
matlab::data::TypedIterator<const T> it(arr_t.begin());
return it.operator->();
}
template <typename T>
T* getOutDataPtr(matlab::data::Array& arr) {
auto range = matlab::data::getWritableElements<T>(arr);
return range.begin().operator->();
}
};
When I perform a speed test in Matlab after compiling with the following parameters (with mingw GCC version 8.3), I find that I have not gained a meaningful speedup for the array sizes I am working with. Why is the speedup only a factor of 2? I am running the code on a machine with 24 cores, I would expect at least an order of magnitude improvement, especially since the code seems practically identical.
Compilation and testing code:
mingwFlags = {'CXXFLAGS="$CXXFLAGS -march=native -std=c++14 -fno-math-errno -ffast-math -fopenmp -DNDEBUG -w -Wno-error"',...
'LDFLAGS="$LDFLAGS -fopenmp"','CXXOPTIMFLAGS="-O3"'};
% ipath is the location of the eigen library.
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKKV2.cpp'); toc
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKKIndices.cpp'); toc
% some preinitialization and data loading. Dummy data could be included here. The point is to test performance
% Timing test:
nTrials = 100;
T = zeros(nTrials,4);
for i = 1:nTrials
tic; RawDataKK2 = CompressKKV2(param,cRF(1:param(5),p.ConnMap),s,p.RXangle);
RawDataKK2 = reshape(RawDataKK2,[p.szRFframe+1,p.na,p.nRX]); T(i,1) = toc;
tic; RawDataKK3=DataCompressKKTest(Data,p.RXangle,s); T(i,2) = toc;
end
chkT = mean(T,1)
Timing results:
chkT =
0.4172 0.8304
The values of the input parameters for my test cases are as follows:
tSize = 2688; xSize = 192; TXSize = 15; RXSize = 16; RFlen = tSize*TXSize;
The rest of the data can take random values for testing performance as long as the datatypes are correct. I think this code will work with other positive values for the above variables as well as long as they remain the same order of magnitude.
3 Answers 3
It's hard to tell what the exact problem is of your code without knowing more about the actual input. Still, here are some generic tips:
More threads don't always help
int nProc = omp_get_num_procs(); omp_set_num_threads(nProc*2);
If you are CPU-bound, then using more threads than you have cores is useless, and will only increase the overhead of starting, stopping and switching between threads. So at the very least, just limit OMP to using nProc
threads.
However, you are mostly moving data around (summing numbers is quite trivial for the CPU), so you are probably memory-bound. On most systems, just a few cores are enough to saturate the available memory bandwidth, so you will never see a speedup of 24. Instead of throwing threads at the problem, you should be checking if you can improve the memory access pattern. See below for some ideas. A profiling tool like Linux perf or Intel VTune might be helpful to check if you are memory bound, and/or if you have lots of cache misses.
Unnecessary initialization of temporary matrix
Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize,TXSize*xSize);
Inside the outer for
-loop you have a temporary matrix. You initialize it with all zeroes, but then you proceed to overwrite all the elements of it in the inner for
-loop. The compiler might not see that it can optimize this, so I would just not zero it at all, and write:
Eigen::MatrixXcd shiftedData(tSize,TXSize*xSize);
Avoid the temporary matrix
If shiftedData
is large enough then by the time you use it to fill in RFDataKK
, it's no longer in the lowest-level caches of your CPU. Ideally, you would avoid the temporary matrix shiftedData
, and just compute the elements of RFDataKK
directly.
So first starting with Cris Luengo's post, I optimized the Matlab function a bit more. At first I thought, why include the temporary, but as per the comments and Cris's point about Matlab's indexing, it appears this is the most efficient or close to the most efficient Matlab implementation:
function [RawDataKK]=DataCompressKKTest2(Data,RXangle,s)
tSize=size(Data,1);
xSize=size(Data,2);
TXSize=size(Data,3);
RXSize=numel(RXangle);
RawDataKK=zeros(tSize,TXSize,RXSize);
slope = s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
for nR=1:RXSize
for nT=1:TXSize
DataTemp = 0;
for nx=1:xSize
DataTemp = DataTemp + circshift(Data(:, nx, nT), nShift(nR, nx));
end
RawDataKK(:, nT, nR) = DataTemp;
end
end
end
nShift's size is relatively insignificant [nRX,xSize], so no point transposing it there.
Next as per Homer512's advice and G. Slippen's advice, the initialization of shiftedData was unnecessary. That improved performance by a bit, but what finally got a major improvement in performance was getting rid of the intermediate variable, shiftedData. The way I had previously written my code made it particularly difficult to see how I could accomplish that. Luckily, Homer512's implementations made this much easier to figure out. Here's the final version:
#include <iostream>
#include "mex.hpp"
#include "mexAdapter.hpp"
#include <Eigen/Dense>
#include <MatlabDataArray.hpp>
#include <cmath>
#include <math.h>
#include <omp.h>
// #include "MatlabEngine.hpp"
class MexFunction : public matlab::mex::Function {
public:
// Pointer to MATLAB engine to call fprintf
std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
// Factory to create MATLAB data arrays
matlab::data::ArrayFactory factory;
void operator()(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
checkArguments(outputs, inputs);
// Initialize input parameters
int tSize = inputs[0][0];
int xSize = inputs[0][1];
int TXSize = inputs[0][2];
int RXSize = inputs[0][3];
int RFlen = inputs[0][4];
auto ptr = getDataPtr<std::complex<double>>(inputs[1]);
Eigen::Map< const Eigen::MatrixXcd > Data( ptr, tSize, TXSize*xSize );
double s = inputs[2][0];
auto ptr2 = getDataPtr<double>(inputs[3]);
Eigen::Map< const Eigen::VectorXd > RXangle( ptr2, RXSize );
outputs[0] = factory.createArray<std::complex<double>>({static_cast<size_t>(tSize),static_cast<size_t>(TXSize*RXSize)});
auto ptrRecon = getOutDataPtr<std::complex<double>>(outputs[0]);
Eigen::Map<Eigen::MatrixXcd> RFDataKK(ptrRecon,tSize,TXSize*RXSize);
// Get num threads
int numThreads = omp_get_max_threads();
int nProc = omp_get_num_procs();
omp_set_num_threads(nProc);
Eigen::MatrixXi nShift = ( (s*RXangle.array().sin()/2).replicate(1,xSize).array() *
(Eigen::VectorXd::LinSpaced(xSize,0,xSize-1).transpose().replicate(RXSize,1).array() -
((xSize-1)*((1-RXangle.array().sign())/2)).matrix().replicate(1,xSize).array()) ).round().cast<int>();
Eigen::MatrixXi nShift2 = tSize - nShift.array();
#pragma omp parallel for
for (int nR = 0; nR < RXSize; nR++) {
auto shiftedDataColBlock = Eigen::MatrixXcd::MapAligned(
RFDataKK.data()+nR*tSize*TXSize, tSize, TXSize);
for (int nx = 0; nx < xSize; nx++) {
const int nShift_x = nShift(nR,nx);
const int nShift2_x = nShift2(nR,nx);
const int bottomSize = tSize-nShift_x;
const auto dataColBlock = Data.middleCols(nx*TXSize, TXSize);
shiftedDataColBlock.topRows(nShift_x) += dataColBlock.middleRows(nShift2_x, nShift_x);
shiftedDataColBlock.bottomRows(bottomSize) += dataColBlock.topRows(bottomSize);
}
}
}
void checkArguments(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
matlab::data::ArrayFactory factory;
if (inputs.size() != 4) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Four inputs required") }));
}
if (inputs[0].getNumberOfElements() != 5) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Need 5 input parameters") }));
}
if (inputs[0].getType() != matlab::data::ArrayType::INT32) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input parameter must be 32 bit integer") }));
}
if (inputs[1].getType() == matlab::data::ArrayType::DOUBLE ||
inputs[1].getType() != matlab::data::ArrayType::COMPLEX_DOUBLE) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input RFData must be type complex double") }));
}
if (inputs[1].getDimensions().size() != 2) {
matlabPtr->feval(u"error",
0, std::vector<matlab::data::Array>({ factory.createScalar("Input must be m-by-n dimension") }));
}
// TODO: Implement remaining checks
}
template <typename T>
const T* getDataPtr(matlab::data::Array arr) {
const matlab::data::TypedArray<T> arr_t = arr;
matlab::data::TypedIterator<const T> it(arr_t.begin());
return it.operator->();
}
template <typename T>
T* getOutDataPtr(matlab::data::Array& arr) {
auto range = matlab::data::getWritableElements<T>(arr);
return range.begin().operator->();
}
};
Upon testing, this version appears to result in the order of magnitude speed up I was looking for!
chkT =
0.7511 0.0721
Copying data around is typically not something that can be sped up significantly by multithreading (as already stated in the earlier answer).
You can even speed up your MATLAB code by rewriting the inner loop:
for nx=1:xSize
DataTemp(:,nx)=circshift(Data(:,nx,nT),nShift(nR,nx));
end
RawDataKK(:,nT,nR)=sum(DataTemp,2);
Here you are copying the data to be summed into a temporary array, instead of summing it directly. MATLAB’s indexing operation is relatively slow, so avoiding the extra indexing is always good.
DataTemp = 0;
for nx = 1:xSize
DataTemp = DataTemp + circshift(Data(:, nx, nT), nShift(nR, nx));
end
RawDataKK(:, nT, nR) = DataTemp;
Other than that, your MATLAB code is quite efficient, you’re iterating in the right order (last index in outer loop, first index in inner loop) and generally doing things correctly. Maybe define nShift
to be transposed of what it is now? It depends on how large it is, whether that makes any difference.
Regarding the code style: Even though it is really common to see MATLAB code with very few spaces in it, adding spaces around operators and after commas really helps readability.
-
\$\begingroup\$ Clarification: In this case, "the extra indexing" refers to
DataTemp(:, nx)
, correct? \$\endgroup\$drakon101– drakon1012024年06月06日 15:19:23 +00:00Commented Jun 6, 2024 at 15:19 -
1\$\begingroup\$ @drakon101 Indeed. It’s an extra copy step that is not needed. But copying into an existing array seems to be relatively expensive, I don’t know why. It typically is cheaper to do
a = b(:,1)
thana(:.1) = b(:,1)
. \$\endgroup\$Cris Luengo– Cris Luengo2024年06月06日 15:22:37 +00:00Commented Jun 6, 2024 at 15:22
shiftedData
matrix inside the parallel region by splitting theparallel for
into aparallel
section with afor
pragma inside. I also don't get why you zero-initialize the matrix. Are there parts that are not overwritten by the loop overxSize
? Your use ofEigen::seq
instead of the more commonmatrix.block(top, left, rows, cols)
ormatrix.middleRows(top, count)
makes this very hard to read, IMHO. \$\endgroup\$circshift
implementation itself seems reasonable to me. It would be better to do it over columns rather than rows but of course that requires transposing somewhere. I looked at the assembly (godbolt) of the central loop (includes a cleaned up, more readable version) and the rowwise sum does not get vectorized due to the reshaping. You might be better off doing the summation on-the-fly without copying into a temporary matrix. Just super simple one scalar per thread per iteration \$\endgroup\$reshape
with aMap
, otherwise Eigen cannot properly analyse the access pattern \$\endgroup\$