I'm working on a nice approximation for the Bilateral Filter.
I have a working code which runs pretty fast yet still I think much can be improved.
The code (C
Code, compiles with C99
) is given by (See code on Compiler Explorer):
#define _USE_MATH_DEFINES
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>
#define OFF 0
#define ON 1
#include <immintrin.h> // AVX
#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16
#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32
#define M_PIf (float)(M_PI)
void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);
void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
int ii, paramN;
float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;
mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
memset(mZ, 0.0f, numRows * numCols * sizeof(float));
// Init Parameters
paramL = paramK * rangeStd;
paramTau = paramK / M_PIf;
paramN = ceilf((paramK * paramK) / M_PIf);
paramOmega = M_PIf / paramL;
vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);
for (ii = 1; ii <= paramN; ii++)
{
vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
}
InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);
// Iteration Number 1
ii = 1;
ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);
UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
// Iteration Number 2
ii = 2;
InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);
ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);
UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
for (ii = 3; ii <= paramN; ii++)
{
UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);
ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);
UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
}
UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);
_mm_free(mZ);
_mm_free(mT);
_mm_free(mC);
_mm_free(mS);
_mm_free(mCOmega);
_mm_free(mSOmega);
_mm_free(mCFiltered);
_mm_free(mSFiltered);
_mm_free(vParamD);
}
// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {
int ii;
for (ii = 0; ii < numRows * numCols; ii++)
{
mCOmega[ii] = cosf(paramOmega * mI[ii]);
mSOmega[ii] = sinf(paramOmega * mI[ii]);
}
}
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {
int ii;
for (ii = 0; ii < numRows * numCols; ii++)
{
mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
}
}
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
int ii;
for (ii = 0; ii < numRows * numCols; ii++)
{
mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
}
}
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
int ii;
float varTmp;
for (ii = 0; ii < numRows * numCols; ii++)
{
varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
mS[ii] = varTmp;
}
}
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
int ii;
float outFactor;
outFactor = (M_PIf * rangeStd * rangeStd) / paramL;
for (ii = 0; ii < numRows * numCols; ii++)
{
mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
}
}
Basically few iterations of Gaussian Blur and Element Wise operations.
I'm testing the code on image of size 8000 x 8000
with spatialStd = 5
, rangeStd = 10 / 255
and paramK = 5
.
This set of parameters means the number of iterations (paramN
in the code) is 8 (Gaussian Blur is done twice per iteration).
My Gaussian Blur implementation takes ~0.18 [Sec] per iteration in the settings above when tested independently.
The issues I'm having with the code:
- Per iteration, it seems the time all Element Wise operations takes more than the Gaussian Blur. It seems to me something isn't efficient with the code.
- Overhead - If I run the code skipping the Gaussian Blur it takes ~1.8 [Sec]. Thinking that 16 x 0.18 = 2.88 [Sec] (16 iterations of the Gaussian Blur) means I should expect run time of ~5 [Sec]. In practice I get ~ 7 [Sec]. It means there is huge overhead somewhere there.
What I've tried:
- Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
- Using the
#pragma vector aligned
to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1). - Disabling OpenMP (Didn't improve).
- Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler). I tried
-Ofast
and-O3
on GCC. UsingO3
on Intel. FP Precision is set to fast.
I'd be happy to get some feedback on that.
Remark
The code is part of a work at University and will be published on GitHub once it is ready.
1 Answer 1
the posted code does not compile! Please post code that compiles. When compiling, always enable the warnings, then fix those warnings. (for gcc
, at a minimum use: -Wall -Wextra -Wconversion -pedantic -std=gnu17
) Note, other compilers have a different set of options to accomplish the same thing
here is what the compiler outputs when given the posted code:
gcc -ggdb -Wall -Wextra -Wconversion -pedantic -std=gnu11 -c "untitled.c"
untitled.c: In function ‘BilateralFilterFastCompressive’:
untitled.c:15:18: warning: implicit declaration of function ‘_mm_malloc’ [-Wimplicit-function-declaration]
mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
^~~~~~~~~~
untitled.c:15:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
^
untitled.c:15:64: error: ‘AVX_ALIGNMENT’ undeclared (first use in this function)
mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
^~~~~~~~~~~~~
untitled.c:15:64: note: each undeclared identifier is reported only once for each function it appears in
untitled.c:16:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
^
untitled.c:17:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:18:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:19:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:20:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:21:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:22:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:24:5: warning: implicit declaration of function ‘memset’ [-Wimplicit-function-declaration]
memset(mZ, 0.0f, numRows * numCols * sizeof(float));
^~~~~~
untitled.c:24:5: warning: incompatible implicit declaration of built-in function ‘memset’
untitled.c:24:5: note: include ‘<string.h>’ or provide a declaration of ‘memset’
untitled.c:24:40: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
memset(mZ, 0.0f, numRows * numCols * sizeof(float));
^
untitled.c:28:26: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
paramL = paramK * rangeStd;
^
untitled.c:1:23: error: ‘M_PI’ undeclared (first use in this function); did you mean ‘M_PIf’?
#define M_PIf (float)(M_PI)
^
untitled.c:29:28: note: in expansion of macro ‘M_PIf’
paramTau = paramK / M_PIf;
^~~~~
untitled.c:30:19: warning: implicit declaration of function ‘ceilf’ [-Wimplicit-function-declaration]
paramN = ceilf((paramK * paramK) / M_PIf);
^~~~~
untitled.c:30:19: warning: incompatible implicit declaration of built-in function ‘ceilf’
untitled.c:30:19: note: include ‘<math.h>’ or provide a declaration of ‘ceilf’
untitled.c:33:41: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);
^
untitled.c:37:31: warning: implicit declaration of function ‘expf’ [-Wimplicit-function-declaration]
vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
^~~~
untitled.c:37:31: warning: incompatible implicit declaration of built-in function ‘expf’
untitled.c:37:31: note: include ‘<math.h>’ or provide a declaration of ‘expf’
untitled.c:37:47: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
^
untitled.c:45:5: warning: implicit declaration of function ‘ImageConvolutionGaussianKernel’ [-Wimplicit-function-declaration]
ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
untitled.c:71:5: warning: implicit declaration of function ‘_mm_free’ [-Wimplicit-function-declaration]
_mm_free(mZ);
^~~~~~~~
untitled.c: In function ‘InitOmegaArrays’:
untitled.c:91:23: warning: implicit declaration of function ‘cosf’ [-Wimplicit-function-declaration]
mCOmega[ii] = cosf(paramOmega * mI[ii]);
^~~~
untitled.c:91:23: warning: incompatible implicit declaration of built-in function ‘cosf’
untitled.c:91:23: note: include ‘<math.h>’ or provide a declaration of ‘cosf’
untitled.c:92:23: warning: implicit declaration of function ‘sinf’ [-Wimplicit-function-declaration]
mSOmega[ii] = sinf(paramOmega * mI[ii]);
^~~~
untitled.c:92:23: warning: incompatible implicit declaration of built-in function ‘sinf’
untitled.c:92:23: note: include ‘<math.h>’ or provide a declaration of ‘sinf’
untitled.c: In function ‘UpdateArrays’:
untitled.c:104:33: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
^
untitled.c: In function ‘UpdtaeOutput’:
untitled.c:147:28: error: ‘outFactor’ undeclared (first use in this function)
mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
^~~~~~~~~
untitled.c:141:84: warning: unused parameter ‘rangeStd’ [-Wunused-parameter]
void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
^~~~~~~~
untitled.c:141:100: warning: unused parameter ‘paramL’ [-Wunused-parameter]
void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
^~~~~~
Compilation failed.
Of course, if you had posted the #include
statements for the needed header files that would have helped a lot.
Strongly suggest honoring the right margin (usually column 72 or 80) by breaking/indenting the lines, similar to:
mSFiltered = (float*)_mm_malloc(
numRows * numCols * sizeof(float), AVX_ALIGNMENT);
then the code would be much easier to read and understand.
There are a lot of 'random' blank lines in the posted code. For ease of readability and understanding, 1) separate code blocks ( for
if
else
while
do...while
switch
case
default
via a single blank line. 2) separate functions by 2 or 3 blank lines (be consistent) 3) follow the axiom: only one statement per line and (at most) one variable declaration per statement.
if, by _mm_malloc
a call to malloc()
is being performed then: 1) in C the returned type is void*
which can be assigned to any pointer. Casting just clutters the code, making it more difficult to understand, debug, etc. 2) always check (!=NULL) the returned value to assure the operation was successful. 3) malloc()
already allocates at the maximum alignment
To obtain the maximum speed of execution, Suggest setting the optimization level to the max value. I.E. -o3
in gcc
regarding:
for (ii = 0; ii < numRows * numCols; ii++)
{
varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
mS[ii] = varTmp;
}
1) varTmp can be eliminated by assigning the first statement directly to mS[ii]
2) Although the compiler should optimize the for()
statement, suggest this:
for (ii = 0; ii < numRows * numCols; ii++)
be changed to:
int size = numRows * numCols;
for (ii = 0; ii < size; ii++)
-
2\$\begingroup\$ But
mS[ii]
is used in the calculation formC[ii]
(and the other way around too) \$\endgroup\$user555045– user5550452018年09月09日 02:08:51 +00:00Commented Sep 9, 2018 at 2:08 -
1\$\begingroup\$ Thank you for your answer.
_mm_malloc()
is an intrinsics function. It works with#include <xmmintrin.h>
. It is the most portable (In the meaning works on all 4 big compilers) to allocate aligned memory. I'm not sure regular allocation is good enough for AVX. I compile with-O3
on Intel Compiler. Also with-Ofast
on GCC. Your suggestion regading the loop ofmS
andmC
will yield different result than needed. \$\endgroup\$Royi– Royi2018年09月09日 07:11:36 +00:00Commented Sep 9, 2018 at 7:11 -
\$\begingroup\$ I compiled the latest posted code, 1) do not change code that you have already posted, Rather post and EDIT. 2) all the calls to
_mm_malloc()
are causing the compiler to output a warning about the conversion fromint
tofloat
having the possibility of changing the value. Strongly suggest usingmalloc()
and if you want the array to be initialized to all0x00
then usecalloc()
\$\endgroup\$user3629249– user36292492018年09月10日 03:28:07 +00:00Commented Sep 10, 2018 at 3:28
Explore related questions
See similar questions with these tags.
UpdtaeOutput
has a typo in the function name.ImageConvolutionGaussianKernel
doesn't seem to be defined anywhere, so I couldn't look at how the whole thing compiles. \$\endgroup\$