Why/How is Matlab's circshift function so efficient?

Question 1

I have the following function in Matlab:

function [RawDataKK]=DataCompressKKTest(Data,RXangle,s)
tSize=size(Data,1);
xSize=size(Data,2);
TXSize=size(Data,3);
RXSize=numel(RXangle);
RawDataKK=zeros(tSize,TXSize,RXSize);
DataTemp=zeros(tSize,xSize);
slope = s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
for nR=1:RXSize
 % indices = mod((0:tSize-1)' + round(nShift(nR,:)), tSize) + 1;
 for nT=1:TXSize
 for nx=1:xSize
 % DataTemp(indices(:,nx)) = Data(:,nx,nT);
 DataTemp(:,nx)=circshift(Data(:,nx,nT),nShift(nR,nx));
 
% DataTemp(1:nShift(nR,nx),nx) = Data( (tSize-nShift(nR,nx)+1):end,nx,nT);
% DataTemp( (nShift(nR,nx)+1):end,nx) = Data( 1:(tSize-nShift(nR,nx)),nx,nT);
 
 end
 RawDataKK(:,nT,nR)=sum(DataTemp,2);
 end
end
end

I tried implementing it as a C++ function using the Eigen toolbox and the Matlab Data API for C++. In C++, I'm implementing the commented out version which returns identical results and runs at roughly the same speed in Matlab.

#include <iostream>
#include "mex.hpp"
#include "mexAdapter.hpp"
#include <Eigen/Dense>
#include <MatlabDataArray.hpp>
#include <cmath>
#include <math.h>
#include <omp.h>
class MexFunction : public matlab::mex::Function {
public:
 // Factory to create MATLAB data arrays
 matlab::data::ArrayFactory factory;
 
 void operator()(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
 checkArguments(outputs, inputs);
 
 // Initialize input parameters
 int tSize = inputs[0][0];
 int xSize = inputs[0][1];
 int TXSize = inputs[0][2];
 int RXSize = inputs[0][3];
 int RFlen = inputs[0][4];
 
 auto ptr = getDataPtr<std::complex<double>>(inputs[1]);
 Eigen::Map< const Eigen::MatrixXcd > Data( ptr, tSize, TXSize*xSize );
 
 double s = inputs[2][0];
 
 auto ptr2 = getDataPtr<double>(inputs[3]);
 Eigen::Map< const Eigen::VectorXd > RXangle( ptr2, RXSize );
 outputs[0] = factory.createArray<std::complex<double>>({static_cast<size_t>(tSize),static_cast<size_t>(TXSize*RXSize)});
 
 auto ptrRecon = getOutDataPtr<std::complex<double>>(outputs[0]);
 Eigen::Map<Eigen::MatrixXcd> RFDataKK(ptrRecon,tSize,TXSize*RXSize);
 // Get num threads
 int numThreads = omp_get_max_threads();
 int nProc = omp_get_num_procs();
 omp_set_num_threads(nProc*2);
 
 Eigen::MatrixXi nShift = ( (s*RXangle.array().sin()/2).replicate(1,xSize).array() * 
 (Eigen::VectorXd::LinSpaced(xSize,0,xSize-1).transpose().replicate(RXSize,1).array() - 
 ((xSize-1)*((1-RXangle.array().sign())/2)).matrix().replicate(1,xSize).array()) ).round().cast<int>();
 
 Eigen::MatrixXi nShift2 = tSize - nShift.array();
 
 #pragma omp parallel for
 for (int nR = 0; nR < RXSize; nR++) {
 
 Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize,TXSize*xSize);
 for (int nx = 0; nx < xSize; nx++) {
 shiftedData( Eigen::seq(0,nShift(nR,nx)-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1)) = Data( Eigen::seq(nShift2(nR,nx),tSize-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1));
 shiftedData( Eigen::seq(nShift(nR,nx),tSize-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1)) = Data( Eigen::seq(0,nShift2(nR,nx)-1) , Eigen::seq(nx*TXSize,(nx+1)*TXSize-1));
 }
 RFDataKK(Eigen::all,Eigen::seq(nR*TXSize,(nR+1)*TXSize-1) ) = shiftedData.reshaped(tSize*TXSize,xSize).rowwise().sum().reshaped(tSize,TXSize);
 }
 }
 void checkArguments(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
 std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
 matlab::data::ArrayFactory factory;
 if (inputs.size() != 4) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Four inputs required") }));
 }
 if (inputs[0].getNumberOfElements() != 5) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Need 5 input parameters") }));
 }
 
 if (inputs[0].getType() != matlab::data::ArrayType::INT32) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input parameter must be 32 bit integer") }));
 }
 if (inputs[1].getType() == matlab::data::ArrayType::DOUBLE ||
 inputs[1].getType() != matlab::data::ArrayType::COMPLEX_DOUBLE) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input RFData must be type complex double") }));
 }
 if (inputs[1].getDimensions().size() != 2) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input must be m-by-n dimension") }));
 }
 
 // TODO: Implement remaining checks
 }
 template <typename T>
 const T* getDataPtr(matlab::data::Array arr) {
 const matlab::data::TypedArray<T> arr_t = arr;
 matlab::data::TypedIterator<const T> it(arr_t.begin());
 return it.operator->();
 }
 template <typename T>
 T* getOutDataPtr(matlab::data::Array& arr) {
 auto range = matlab::data::getWritableElements<T>(arr);
 return range.begin().operator->();
 }
};

When I perform a speed test in Matlab after compiling with the following parameters (with mingw GCC version 8.3), I find that I have not gained a meaningful speedup for the array sizes I am working with. Why is the speedup only a factor of 2? I am running the code on a machine with 24 cores, I would expect at least an order of magnitude improvement, especially since the code seems practically identical.

Compilation and testing code:

mingwFlags = {'CXXFLAGS="$CXXFLAGS -march=native -std=c++14 -fno-math-errno -ffast-math -fopenmp -DNDEBUG -w -Wno-error"',...
 'LDFLAGS="$LDFLAGS -fopenmp"','CXXOPTIMFLAGS="-O3"'};
% ipath is the location of the eigen library. 
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKKV2.cpp'); toc
tic; mex(ipath,mingwFlags{1},mingwFlags{2},mingwFlags{3},'CompressKKIndices.cpp'); toc
% some preinitialization and data loading. Dummy data could be included here. The point is to test performance
% Timing test:
nTrials = 100;
T = zeros(nTrials,4);
for i = 1:nTrials
 tic; RawDataKK2 = CompressKKV2(param,cRF(1:param(5),p.ConnMap),s,p.RXangle); 
 RawDataKK2 = reshape(RawDataKK2,[p.szRFframe+1,p.na,p.nRX]); T(i,1) = toc;
 tic; RawDataKK3=DataCompressKKTest(Data,p.RXangle,s); T(i,2) = toc;
end
chkT = mean(T,1)

Timing results:

chkT =
 0.4172 0.8304

The values of the input parameters for my test cases are as follows:

tSize = 2688; xSize = 192; TXSize = 15; RXSize = 16; RFlen = tSize*TXSize;

The rest of the data can take random values for testing performance as long as the datatypes are correct. I think this code will work with other positive values for the above variables as well as long as they remain the same order of magnitude.

Question 2

A low-hanging fruit for performance would be to avoid the repeated allocation of the shiftedData matrix inside the parallel region by splitting the parallel for into a parallel section with a for pragma inside. I also don't get why you zero-initialize the matrix. Are there parts that are not overwritten by the loop over xSize? Your use of Eigen::seq instead of the more common matrix.block(top, left, rows, cols) or matrix.middleRows(top, count) makes this very hard to read, IMHO.

Question 3

The circshift implementation itself seems reasonable to me. It would be better to do it over columns rather than rows but of course that requires transposing somewhere. I looked at the assembly (godbolt) of the central loop (includes a cleaned up, more readable version) and the rowwise sum does not get vectorized due to the reshaping. You might be better off doing the summation on-the-fly without copying into a temporary matrix. Just super simple one scalar per thread per iteration

Question 4

Actually, try the third version here. One particular issue with the rowwise sum is that Eigen uses a naive implementation where each memory access is one full column vector away from the previous one, which leads to poor cache performance. Numpy actually has a similar issue as discussed here. Both have the same solution: Change the iteration order (even if it means more memory accesses) or transpose the arrays

Question 5

You can also coax Eigen into doing the right thing by replacing the rowwise sum with a matrix-vector multiplication with a One vector. But you would still need to replace the reshape with a Map, otherwise Eigen cannot properly analyse the access pattern

Question 6

To help reviewers give you better answers, we need to know what the code is intended to achieve. Please add sufficient context to your question to describe the purpose of the code. We want to know why much more than how. The more you tell us about what your code is for, the easier it will be for reviewers to help you. Also, edit the title to simply summarise the task, rather than your concerns about the code.

Question 7

It's hard to tell what the exact problem is of your code without knowing more about the actual input. Still, here are some generic tips:

More threads don't always help

int nProc = omp_get_num_procs();
omp_set_num_threads(nProc*2);

If you are CPU-bound, then using more threads than you have cores is useless, and will only increase the overhead of starting, stopping and switching between threads. So at the very least, just limit OMP to using nProc threads.

However, you are mostly moving data around (summing numbers is quite trivial for the CPU), so you are probably memory-bound. On most systems, just a few cores are enough to saturate the available memory bandwidth, so you will never see a speedup of 24. Instead of throwing threads at the problem, you should be checking if you can improve the memory access pattern. See below for some ideas. A profiling tool like Linux perf or Intel VTune might be helpful to check if you are memory bound, and/or if you have lots of cache misses.

Unnecessary initialization of temporary matrix

Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize,TXSize*xSize);

Inside the outer for-loop you have a temporary matrix. You initialize it with all zeroes, but then you proceed to overwrite all the elements of it in the inner for-loop. The compiler might not see that it can optimize this, so I would just not zero it at all, and write:

Eigen::MatrixXcd shiftedData(tSize,TXSize*xSize);

Avoid the temporary matrix

If shiftedData is large enough then by the time you use it to fill in RFDataKK, it's no longer in the lowest-level caches of your CPU. Ideally, you would avoid the temporary matrix shiftedData, and just compute the elements of RFDataKK directly.

Question 8

So first starting with Cris Luengo's post, I optimized the Matlab function a bit more. At first I thought, why include the temporary, but as per the comments and Cris's point about Matlab's indexing, it appears this is the most efficient or close to the most efficient Matlab implementation:

function [RawDataKK]=DataCompressKKTest2(Data,RXangle,s)
tSize=size(Data,1);
xSize=size(Data,2);
TXSize=size(Data,3);
RXSize=numel(RXangle);
RawDataKK=zeros(tSize,TXSize,RXSize);
slope = s*sin(RXangle(:))/2;
nShift = round(slope.*((0:xSize-1) - (xSize-1)*(1-sign(RXangle(:)))/2));
for nR=1:RXSize
 for nT=1:TXSize
 DataTemp = 0;
 for nx=1:xSize
 DataTemp = DataTemp + circshift(Data(:, nx, nT), nShift(nR, nx));
 
 end
 RawDataKK(:, nT, nR) = DataTemp;
 end
end
end

nShift's size is relatively insignificant [nRX,xSize], so no point transposing it there.

Next as per Homer512's advice and G. Slippen's advice, the initialization of shiftedData was unnecessary. That improved performance by a bit, but what finally got a major improvement in performance was getting rid of the intermediate variable, shiftedData. The way I had previously written my code made it particularly difficult to see how I could accomplish that. Luckily, Homer512's implementations made this much easier to figure out. Here's the final version:

#include <iostream>
#include "mex.hpp"
#include "mexAdapter.hpp"
#include <Eigen/Dense>
#include <MatlabDataArray.hpp>
#include <cmath>
#include <math.h>
#include <omp.h>
// #include "MatlabEngine.hpp"
class MexFunction : public matlab::mex::Function {
public:
 
 // Pointer to MATLAB engine to call fprintf
 std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
 // Factory to create MATLAB data arrays
 matlab::data::ArrayFactory factory;
 
 void operator()(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
 checkArguments(outputs, inputs);
 
 // Initialize input parameters
 int tSize = inputs[0][0];
 int xSize = inputs[0][1];
 int TXSize = inputs[0][2];
 int RXSize = inputs[0][3];
 int RFlen = inputs[0][4];
 
 auto ptr = getDataPtr<std::complex<double>>(inputs[1]);
 Eigen::Map< const Eigen::MatrixXcd > Data( ptr, tSize, TXSize*xSize );
 
 double s = inputs[2][0];
 
 auto ptr2 = getDataPtr<double>(inputs[3]);
 Eigen::Map< const Eigen::VectorXd > RXangle( ptr2, RXSize );
 outputs[0] = factory.createArray<std::complex<double>>({static_cast<size_t>(tSize),static_cast<size_t>(TXSize*RXSize)});
 
 auto ptrRecon = getOutDataPtr<std::complex<double>>(outputs[0]);
 Eigen::Map<Eigen::MatrixXcd> RFDataKK(ptrRecon,tSize,TXSize*RXSize);
 // Get num threads
 int numThreads = omp_get_max_threads();
 int nProc = omp_get_num_procs();
 omp_set_num_threads(nProc);
 
 Eigen::MatrixXi nShift = ( (s*RXangle.array().sin()/2).replicate(1,xSize).array() * 
 (Eigen::VectorXd::LinSpaced(xSize,0,xSize-1).transpose().replicate(RXSize,1).array() - 
 ((xSize-1)*((1-RXangle.array().sign())/2)).matrix().replicate(1,xSize).array()) ).round().cast<int>();
 
 Eigen::MatrixXi nShift2 = tSize - nShift.array();
 
 #pragma omp parallel for
 for (int nR = 0; nR < RXSize; nR++) {
 
 auto shiftedDataColBlock = Eigen::MatrixXcd::MapAligned(
 RFDataKK.data()+nR*tSize*TXSize, tSize, TXSize);
 
 for (int nx = 0; nx < xSize; nx++) {
 const int nShift_x = nShift(nR,nx);
 const int nShift2_x = nShift2(nR,nx);
 const int bottomSize = tSize-nShift_x;
 const auto dataColBlock = Data.middleCols(nx*TXSize, TXSize);
 shiftedDataColBlock.topRows(nShift_x) += dataColBlock.middleRows(nShift2_x, nShift_x);
 shiftedDataColBlock.bottomRows(bottomSize) += dataColBlock.topRows(bottomSize);
 }
 
 }
 }
 void checkArguments(matlab::mex::ArgumentList outputs, matlab::mex::ArgumentList inputs) {
 std::shared_ptr<matlab::engine::MATLABEngine> matlabPtr = getEngine();
 matlab::data::ArrayFactory factory;
 if (inputs.size() != 4) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Four inputs required") }));
 }
 if (inputs[0].getNumberOfElements() != 5) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Need 5 input parameters") }));
 }
 
 if (inputs[0].getType() != matlab::data::ArrayType::INT32) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input parameter must be 32 bit integer") }));
 }
 if (inputs[1].getType() == matlab::data::ArrayType::DOUBLE ||
 inputs[1].getType() != matlab::data::ArrayType::COMPLEX_DOUBLE) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input RFData must be type complex double") }));
 }
 if (inputs[1].getDimensions().size() != 2) {
 matlabPtr->feval(u"error", 
 0, std::vector<matlab::data::Array>({ factory.createScalar("Input must be m-by-n dimension") }));
 }
 
 // TODO: Implement remaining checks
 }
 template <typename T>
 const T* getDataPtr(matlab::data::Array arr) {
 const matlab::data::TypedArray<T> arr_t = arr;
 matlab::data::TypedIterator<const T> it(arr_t.begin());
 return it.operator->();
 }
 template <typename T>
 T* getOutDataPtr(matlab::data::Array& arr) {
 auto range = matlab::data::getWritableElements<T>(arr);
 return range.begin().operator->();
 }
};

Upon testing, this version appears to result in the order of magnitude speed up I was looking for!

chkT =
 0.7511 0.0721

Question 9

Copying data around is typically not something that can be sped up significantly by multithreading (as already stated in the earlier answer).

You can even speed up your MATLAB code by rewriting the inner loop:

for nx=1:xSize
 DataTemp(:,nx)=circshift(Data(:,nx,nT),nShift(nR,nx));
end
RawDataKK(:,nT,nR)=sum(DataTemp,2);

Here you are copying the data to be summed into a temporary array, instead of summing it directly. MATLAB’s indexing operation is relatively slow, so avoiding the extra indexing is always good.

DataTemp = 0;
for nx = 1:xSize
 DataTemp = DataTemp + circshift(Data(:, nx, nT), nShift(nR, nx));
end
RawDataKK(:, nT, nR) = DataTemp;

Other than that, your MATLAB code is quite efficient, you’re iterating in the right order (last index in outer loop, first index in inner loop) and generally doing things correctly. Maybe define nShift to be transposed of what it is now? It depends on how large it is, whether that makes any difference.

Regarding the code style: Even though it is really common to see MATLAB code with very few spaces in it, adding spaces around operators and after commas really helps readability.

Question 10

Clarification: In this case, "the extra indexing" refers to DataTemp(:, nx), correct?

Question 11

@drakon101 Indeed. It’s an extra copy step that is not needed. But copying into an existing array seems to be relatively expensive, I don’t know why. It typically is cheaper to do a = b(:,1) than a(:.1) = b(:,1).

G. Sliepen G. Sliepen 69.1k3 gold badges74 silver badges180 bronze badges · Accepted Answer · 2024-06-05 21:57:22Z

It's hard to tell what the exact problem is of your code without knowing more about the actual input. Still, here are some generic tips:

More threads don't always help

int nProc = omp_get_num_procs();
omp_set_num_threads(nProc*2);

If you are CPU-bound, then using more threads than you have cores is useless, and will only increase the overhead of starting, stopping and switching between threads. So at the very least, just limit OMP to using nProc threads.

However, you are mostly moving data around (summing numbers is quite trivial for the CPU), so you are probably memory-bound. On most systems, just a few cores are enough to saturate the available memory bandwidth, so you will never see a speedup of 24. Instead of throwing threads at the problem, you should be checking if you can improve the memory access pattern. See below for some ideas. A profiling tool like Linux perf or Intel VTune might be helpful to check if you are memory bound, and/or if you have lots of cache misses.

Unnecessary initialization of temporary matrix

Eigen::MatrixXcd shiftedData = Eigen::MatrixXcd::Zero(tSize,TXSize*xSize);

Inside the outer for-loop you have a temporary matrix. You initialize it with all zeroes, but then you proceed to overwrite all the elements of it in the inner for-loop. The compiler might not see that it can optimize this, so I would just not zero it at all, and write:

Eigen::MatrixXcd shiftedData(tSize,TXSize*xSize);

Avoid the temporary matrix

If shiftedData is large enough then by the time you use it to fill in RFDataKK, it's no longer in the lowest-level caches of your CPU. Ideally, you would avoid the temporary matrix shiftedData, and just compute the elements of RFDataKK directly.

Stack Exchange Network

Why/How is Matlab's circshift function so efficient?

3 Answers 3

More threads don't always help

Unnecessary initialization of temporary matrix

Avoid the temporary matrix

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Why/How is Matlab's circshift function so efficient?

3 Answers 3

More threads don't always help

Unnecessary initialization of temporary matrix

Avoid the temporary matrix

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions