Optimizations of C Code - Bilateral Filter

Question 1

I'm working on a nice approximation for the Bilateral Filter.
I have a working code which runs pretty fast yet still I think much can be improved.

The code (C Code, compiles with C99) is given by (See code on Compiler Explorer):

#define _USE_MATH_DEFINES
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>
#define OFF 0
#define ON 1
#include <immintrin.h> // AVX
#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16
#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32
#define M_PIf (float)(M_PI)
void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);
void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
 int ii, paramN;
 float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
 mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 memset(mZ, 0.0f, numRows * numCols * sizeof(float));
 // Init Parameters
 paramL = paramK * rangeStd;
 paramTau = paramK / M_PIf;
 paramN = ceilf((paramK * paramK) / M_PIf);
 paramOmega = M_PIf / paramL;
 vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);
 for (ii = 1; ii <= paramN; ii++)
 {
 vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
 }
 InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);
 // Iteration Number 1
 ii = 1;
 ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
 ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);
 UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
 // Iteration Number 2
 ii = 2;
 InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);
 ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
 ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);
 UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
 for (ii = 3; ii <= paramN; ii++)
 {
 UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);
 ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
 ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);
 UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
 }
 UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);
 _mm_free(mZ);
 _mm_free(mT);
 _mm_free(mC);
 _mm_free(mS);
 _mm_free(mCOmega);
 _mm_free(mSOmega);
 _mm_free(mCFiltered);
 _mm_free(mSFiltered);
 _mm_free(vParamD);
}
// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {
 int ii;
 for (ii = 0; ii < numRows * numCols; ii++)
 {
 mCOmega[ii] = cosf(paramOmega * mI[ii]);
 mSOmega[ii] = sinf(paramOmega * mI[ii]);
 }
}
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {
 int ii;
 for (ii = 0; ii < numRows * numCols; ii++)
 {
 mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
 mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
 }
}
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
 int ii;
 for (ii = 0; ii < numRows * numCols; ii++)
 {
 mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
 mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
 }
}
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
 int ii;
 float varTmp;
 for (ii = 0; ii < numRows * numCols; ii++)
 {
 varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
 mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
 mS[ii] = varTmp;
 }
}
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
 int ii;
 float outFactor;
 outFactor = (M_PIf * rangeStd * rangeStd) / paramL;
 for (ii = 0; ii < numRows * numCols; ii++)
 {
 mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
 }
}

Basically few iterations of Gaussian Blur and Element Wise operations.

I'm testing the code on image of size 8000 x 8000 with spatialStd = 5, rangeStd = 10 / 255 and paramK = 5.
This set of parameters means the number of iterations (paramN in the code) is 8 (Gaussian Blur is done twice per iteration).
My Gaussian Blur implementation takes ~0.18 [Sec] per iteration in the settings above when tested independently.

The issues I'm having with the code:

Per iteration, it seems the time all Element Wise operations takes more than the Gaussian Blur. It seems to me something isn't efficient with the code.
Overhead - If I run the code skipping the Gaussian Blur it takes ~1.8 [Sec]. Thinking that 16 x 0.18 = 2.88 [Sec] (16 iterations of the Gaussian Blur) means I should expect run time of ~5 [Sec]. In practice I get ~ 7 [Sec]. It means there is huge overhead somewhere there.

What I've tried:

Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
Using the #pragma vector aligned to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1).
Disabling OpenMP (Didn't improve).
Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler). I tried -Ofast and -O3 on GCC. Using O3 on Intel. FP Precision is set to fast.

I'd be happy to get some feedback on that.

Remark
The code is part of a work at University and will be published on GitHub once it is ready.

Question 2

UpdtaeOutput has a typo in the function name. ImageConvolutionGaussianKernel doesn't seem to be defined anywhere, so I couldn't look at how the whole thing compiles.

Question 3

Well, In my defense it was wrong on both sides :-). I fixed it.

Question 4

@PeterCordes, I updated the code - godbolt.org/z/90d7vz.

Question 5

Have you tested on smaller image sizes? I feel like your problem is that you are operating on huge arrays which do not fit in your cpu's cache. If you were to break the images into smaller pieces such that your set of arrays fit into the cache, and then iterate through all the smaller pieces, I think your program would run faster.

Question 6

@JS1, Tiling :-). Well, beside the Gaussian Blur operation all other operations are element wise with linear accessing so I think it is OK on that department. Tiling with the Gaussian Blur inside will be rewriting the code.

Question 7

the posted code does not compile! Please post code that compiles. When compiling, always enable the warnings, then fix those warnings. (for gcc, at a minimum use: -Wall -Wextra -Wconversion -pedantic -std=gnu17 ) Note, other compilers have a different set of options to accomplish the same thing

here is what the compiler outputs when given the posted code:

gcc -ggdb -Wall -Wextra -Wconversion -pedantic -std=gnu11 -c "untitled.c" 
untitled.c: In function ‘BilateralFilterFastCompressive’:
untitled.c:15:18: warning: implicit declaration of function ‘_mm_malloc’ [-Wimplicit-function-declaration]
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^~~~~~~~~~
untitled.c:15:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^
untitled.c:15:64: error: ‘AVX_ALIGNMENT’ undeclared (first use in this function)
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^~~~~~~~~~~~~
untitled.c:15:64: note: each undeclared identifier is reported only once for each function it appears in
untitled.c:16:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
 ^
untitled.c:17:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:18:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:19:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:20:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:21:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:22:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:24:5: warning: implicit declaration of function ‘memset’ [-Wimplicit-function-declaration]
 memset(mZ, 0.0f, numRows * numCols * sizeof(float));
 ^~~~~~
untitled.c:24:5: warning: incompatible implicit declaration of built-in function ‘memset’
untitled.c:24:5: note: include ‘<string.h>’ or provide a declaration of ‘memset’
untitled.c:24:40: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 memset(mZ, 0.0f, numRows * numCols * sizeof(float));
 ^
untitled.c:28:26: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 paramL = paramK * rangeStd;
 ^
untitled.c:1:23: error: ‘M_PI’ undeclared (first use in this function); did you mean ‘M_PIf’?
 #define M_PIf (float)(M_PI)
 ^
untitled.c:29:28: note: in expansion of macro ‘M_PIf’
 paramTau = paramK / M_PIf;
 ^~~~~
untitled.c:30:19: warning: implicit declaration of function ‘ceilf’ [-Wimplicit-function-declaration]
 paramN = ceilf((paramK * paramK) / M_PIf);
 ^~~~~
untitled.c:30:19: warning: incompatible implicit declaration of built-in function ‘ceilf’
untitled.c:30:19: note: include ‘<math.h>’ or provide a declaration of ‘ceilf’
untitled.c:33:41: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:37:31: warning: implicit declaration of function ‘expf’ [-Wimplicit-function-declaration]
 vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
 ^~~~
untitled.c:37:31: warning: incompatible implicit declaration of built-in function ‘expf’
untitled.c:37:31: note: include ‘<math.h>’ or provide a declaration of ‘expf’
untitled.c:37:47: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
 ^
untitled.c:45:5: warning: implicit declaration of function ‘ImageConvolutionGaussianKernel’ [-Wimplicit-function-declaration]
 ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
untitled.c:71:5: warning: implicit declaration of function ‘_mm_free’ [-Wimplicit-function-declaration]
 _mm_free(mZ);
 ^~~~~~~~
untitled.c: In function ‘InitOmegaArrays’:
untitled.c:91:23: warning: implicit declaration of function ‘cosf’ [-Wimplicit-function-declaration]
 mCOmega[ii] = cosf(paramOmega * mI[ii]);
 ^~~~
untitled.c:91:23: warning: incompatible implicit declaration of built-in function ‘cosf’
untitled.c:91:23: note: include ‘<math.h>’ or provide a declaration of ‘cosf’
untitled.c:92:23: warning: implicit declaration of function ‘sinf’ [-Wimplicit-function-declaration]
 mSOmega[ii] = sinf(paramOmega * mI[ii]);
 ^~~~
untitled.c:92:23: warning: incompatible implicit declaration of built-in function ‘sinf’
untitled.c:92:23: note: include ‘<math.h>’ or provide a declaration of ‘sinf’
untitled.c: In function ‘UpdateArrays’:
untitled.c:104:33: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
 ^
untitled.c: In function ‘UpdtaeOutput’:
untitled.c:147:28: error: ‘outFactor’ undeclared (first use in this function)
 mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
 ^~~~~~~~~
untitled.c:141:84: warning: unused parameter ‘rangeStd’ [-Wunused-parameter]
 void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
 ^~~~~~~~
untitled.c:141:100: warning: unused parameter ‘paramL’ [-Wunused-parameter]
 void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
 ^~~~~~
Compilation failed.

Of course, if you had posted the #include statements for the needed header files that would have helped a lot.

Strongly suggest honoring the right margin (usually column 72 or 80) by breaking/indenting the lines, similar to:

mSFiltered = (float*)_mm_malloc(
 numRows * numCols * sizeof(float), AVX_ALIGNMENT);

then the code would be much easier to read and understand.

There are a lot of 'random' blank lines in the posted code. For ease of readability and understanding, 1) separate code blocks ( for if else while do...while switch case default via a single blank line. 2) separate functions by 2 or 3 blank lines (be consistent) 3) follow the axiom: only one statement per line and (at most) one variable declaration per statement.

if, by _mm_malloc a call to malloc() is being performed then: 1) in C the returned type is void* which can be assigned to any pointer. Casting just clutters the code, making it more difficult to understand, debug, etc. 2) always check (!=NULL) the returned value to assure the operation was successful. 3) malloc() already allocates at the maximum alignment

To obtain the maximum speed of execution, Suggest setting the optimization level to the max value. I.E. -o3 in gcc

regarding:

for (ii = 0; ii < numRows * numCols; ii++)
{
 varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
 mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
 mS[ii] = varTmp;
}

1) varTmp can be eliminated by assigning the first statement directly to mS[ii]

2) Although the compiler should optimize the for() statement, suggest this:

for (ii = 0; ii < numRows * numCols; ii++)

be changed to:

int size = numRows * numCols;
for (ii = 0; ii < size; ii++)

Question 8

But mS[ii] is used in the calculation for mC[ii] (and the other way around too)

Question 9

Thank you for your answer. _mm_malloc() is an intrinsics function. It works with #include <xmmintrin.h>. It is the most portable (In the meaning works on all 4 big compilers) to allocate aligned memory. I'm not sure regular allocation is good enough for AVX. I compile with -O3 on Intel Compiler. Also with -Ofast on GCC. Your suggestion regading the loop of mS and mCwill yield different result than needed.

Question 10

I compiled the latest posted code, 1) do not change code that you have already posted, Rather post and EDIT. 2) all the calls to _mm_malloc() are causing the compiler to output a warning about the conversion from int to float having the possibility of changing the value. Strongly suggest using malloc() and if you want the array to be initialized to all 0x00 then use calloc()

user3629249 user3629249 2,90811 silver badges10 bronze badges · Answer 1 · 2018-09-09 02:01:34Z

the posted code does not compile! Please post code that compiles. When compiling, always enable the warnings, then fix those warnings. (for gcc, at a minimum use: -Wall -Wextra -Wconversion -pedantic -std=gnu17 ) Note, other compilers have a different set of options to accomplish the same thing

here is what the compiler outputs when given the posted code:

gcc -ggdb -Wall -Wextra -Wconversion -pedantic -std=gnu11 -c "untitled.c" 
untitled.c: In function ‘BilateralFilterFastCompressive’:
untitled.c:15:18: warning: implicit declaration of function ‘_mm_malloc’ [-Wimplicit-function-declaration]
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^~~~~~~~~~
untitled.c:15:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^
untitled.c:15:64: error: ‘AVX_ALIGNMENT’ undeclared (first use in this function)
 mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
 ^~~~~~~~~~~~~
untitled.c:15:64: note: each undeclared identifier is reported only once for each function it appears in
untitled.c:16:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
 ^
untitled.c:17:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:18:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:19:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:20:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:21:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:22:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:24:5: warning: implicit declaration of function ‘memset’ [-Wimplicit-function-declaration]
 memset(mZ, 0.0f, numRows * numCols * sizeof(float));
 ^~~~~~
untitled.c:24:5: warning: incompatible implicit declaration of built-in function ‘memset’
untitled.c:24:5: note: include ‘<string.h>’ or provide a declaration of ‘memset’
untitled.c:24:40: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 memset(mZ, 0.0f, numRows * numCols * sizeof(float));
 ^
untitled.c:28:26: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 paramL = paramK * rangeStd;
 ^
untitled.c:1:23: error: ‘M_PI’ undeclared (first use in this function); did you mean ‘M_PIf’?
 #define M_PIf (float)(M_PI)
 ^
untitled.c:29:28: note: in expansion of macro ‘M_PIf’
 paramTau = paramK / M_PIf;
 ^~~~~
untitled.c:30:19: warning: implicit declaration of function ‘ceilf’ [-Wimplicit-function-declaration]
 paramN = ceilf((paramK * paramK) / M_PIf);
 ^~~~~
untitled.c:30:19: warning: incompatible implicit declaration of built-in function ‘ceilf’
untitled.c:30:19: note: include ‘<math.h>’ or provide a declaration of ‘ceilf’
untitled.c:33:41: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]
 vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);
 ^
untitled.c:37:31: warning: implicit declaration of function ‘expf’ [-Wimplicit-function-declaration]
 vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
 ^~~~
untitled.c:37:31: warning: incompatible implicit declaration of built-in function ‘expf’
untitled.c:37:31: note: include ‘<math.h>’ or provide a declaration of ‘expf’
untitled.c:37:47: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
 ^
untitled.c:45:5: warning: implicit declaration of function ‘ImageConvolutionGaussianKernel’ [-Wimplicit-function-declaration]
 ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
untitled.c:71:5: warning: implicit declaration of function ‘_mm_free’ [-Wimplicit-function-declaration]
 _mm_free(mZ);
 ^~~~~~~~
untitled.c: In function ‘InitOmegaArrays’:
untitled.c:91:23: warning: implicit declaration of function ‘cosf’ [-Wimplicit-function-declaration]
 mCOmega[ii] = cosf(paramOmega * mI[ii]);
 ^~~~
untitled.c:91:23: warning: incompatible implicit declaration of built-in function ‘cosf’
untitled.c:91:23: note: include ‘<math.h>’ or provide a declaration of ‘cosf’
untitled.c:92:23: warning: implicit declaration of function ‘sinf’ [-Wimplicit-function-declaration]
 mSOmega[ii] = sinf(paramOmega * mI[ii]);
 ^~~~
untitled.c:92:23: warning: incompatible implicit declaration of built-in function ‘sinf’
untitled.c:92:23: note: include ‘<math.h>’ or provide a declaration of ‘sinf’
untitled.c: In function ‘UpdateArrays’:
untitled.c:104:33: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]
 mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
 ^
untitled.c: In function ‘UpdtaeOutput’:
untitled.c:147:28: error: ‘outFactor’ undeclared (first use in this function)
 mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
 ^~~~~~~~~
untitled.c:141:84: warning: unused parameter ‘rangeStd’ [-Wunused-parameter]
 void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
 ^~~~~~~~
untitled.c:141:100: warning: unused parameter ‘paramL’ [-Wunused-parameter]
 void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {
 ^~~~~~
Compilation failed.

Of course, if you had posted the #include statements for the needed header files that would have helped a lot.

Strongly suggest honoring the right margin (usually column 72 or 80) by breaking/indenting the lines, similar to:

mSFiltered = (float*)_mm_malloc(
 numRows * numCols * sizeof(float), AVX_ALIGNMENT);

then the code would be much easier to read and understand.

There are a lot of 'random' blank lines in the posted code. For ease of readability and understanding, 1) separate code blocks ( for if else while do...while switch case default via a single blank line. 2) separate functions by 2 or 3 blank lines (be consistent) 3) follow the axiom: only one statement per line and (at most) one variable declaration per statement.

if, by _mm_malloc a call to malloc() is being performed then: 1) in C the returned type is void* which can be assigned to any pointer. Casting just clutters the code, making it more difficult to understand, debug, etc. 2) always check (!=NULL) the returned value to assure the operation was successful. 3) malloc() already allocates at the maximum alignment

To obtain the maximum speed of execution, Suggest setting the optimization level to the max value. I.E. -o3 in gcc

regarding:

for (ii = 0; ii < numRows * numCols; ii++)
{
 varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
 mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
 mS[ii] = varTmp;
}

1) varTmp can be eliminated by assigning the first statement directly to mS[ii]

2) Although the compiler should optimize the for() statement, suggest this:

for (ii = 0; ii < numRows * numCols; ii++)

be changed to:

int size = numRows * numCols;
for (ii = 0; ii < size; ii++)

But mS[ii] is used in the calculation for mC[ii] (and the other way around too)
Thank you for your answer. _mm_malloc() is an intrinsics function. It works with #include <xmmintrin.h>. It is the most portable (In the meaning works on all 4 big compilers) to allocate aligned memory. I'm not sure regular allocation is good enough for AVX. I compile with -O3 on Intel Compiler. Also with -Ofast on GCC. Your suggestion regading the loop of mS and mCwill yield different result than needed.
I compiled the latest posted code, 1) do not change code that you have already posted, Rather post and EDIT. 2) all the calls to _mm_malloc() are causing the compiler to output a warning about the conversion from int to float having the possibility of changing the value. Strongly suggest using malloc() and if you want the array to be initialized to all 0x00 then use calloc()

Stack Exchange Network

Optimizations of C Code - Bilateral Filter

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Optimizations of C Code - Bilateral Filter

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions