I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The bilinear interpolation is in 8D space, so each "color" is an 8 dimensional vector in [0,255].
I have also written a small benchmarking program.
My code is between 0% and 50% faster than the original one.
I would like to know of any other improvements I could make and also I would like to know if I missed any bugs. And is the performance increase general, and not just specific to my computer architecture?
#include <opencv2/opencv.hpp>

#include <array>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <time.h>
using std::cout;
using std::endl;
/// Returns a pseudo-random float uniformly distributed in [0, 10].
/// NOTE(review): this name collides with POSIX `long random()` declared by
/// <cstdlib> on many platforms (g++ defines _GNU_SOURCE), which can break
/// compilation; consider renaming or moving it into a namespace.
float random()
{
// std::rand() yields an int in [0, RAND_MAX]; scale into [0, 10].
return static_cast<float>(std::rand()) / RAND_MAX * 10.0f;
}
/**
 * Reference (scalar) bilinear interpolation of an 8-bit, cn-channel color.
 * The corner samples q11/q12/q21/q22 lie at (x1,y1), (x1,y2), (x2,y1) and
 * (x2,y2); returns the interpolated color at the query point (x, y).
 */
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
// Distances from the query point to the cell edges.
const float dxHigh = x2 - x;
const float dyHigh = y2 - y;
const float dyLow = y - y1;
const float dxLow = x - x1;
// Normalisation: inverse of the cell area.
const float invArea = 1.f / ((x2 - x1) * (y2 - y1));
cv::Vec<uchar, cn> result;
// Blend each channel independently with the four corner weights.
for (int c = 0; c < cn; ++c)
{
const float blended = invArea * (
q11[c] * dxHigh * dyHigh +
q21[c] * dxLow * dyHigh +
q12[c] * dxHigh * dyLow +
q22[c] * dxLow * dyLow
);
result[c] = cvRound(blended);
}
return result;
}
#include <immintrin.h>
#include <xmmintrin.h>
/**
* Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
* If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
// Convert from 'int32' to 'int8'
// Digit = Sample value with one byte length:
// FROM: 1000|2000|3000|4000||||5000|6000|7000|8000 (BIG ENDIAN)
// TO: 1234|0000|0000|0000||||5678|0000|0000|0000
static const __m256i _mask = []() {
std::array<uchar, 32> mask;
std::fill(mask.begin(), mask.end(), -1);
for (int i = 0; i < 4; i++) {
mask[ i] = 4 * i;
mask[16+i] = 4 * i;
}
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
}();
const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);
const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
__m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
_high = _mm_slli_epi64(_high, 32);
const __m128i _or = _mm_or_si128(_low, _high);
return _or;
}
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
alignas(16) std::array<float, 8> C;
const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
__m256 Sub = _mm256_sub_ps(A_vec, B_vec);
_mm256_store_ps(C.data(), Sub);
const float& x2x1 = C[0];
const float& y2y1 = C[1];
const float& x2x = C[2];
const float& y2y = C[3];
const float& yy1 = C[4];
const float& xx1 = C[5];
// Load each register as an array of 8x the same float32 value which is ks[i]
const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
const __m256 _k = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient
// Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
__m128i _q11_uint8 = _mm_loadu_si64(q11.val);
// Convert from 'uint8' to 'int32'
// Because conversions from integral types to floating types only work with 'int32'
__m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);
// Convert from 'int32' to 'float'
__m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);
// Same with q21, q12, and q22
// ######
__m128i _q21_uint8 = _mm_loadu_si64(q21.val);
__m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
__m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);
__m128i _q12_uint8 = _mm_loadu_si64(q12.val);
__m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
__m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);
__m128i _q22_uint8 = _mm_loadu_si64(q22.val);
__m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
__m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
// ######
__m256 _res = {};
_res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
_res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
_res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
_res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
_res = _mm256_mul_ps(_res, _k);
// Round to nearest int, suppress exceptions
_res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
// Convert back from 'float32' to 'int32'
__m256i _resi = _mm256_cvtps_epi32(_res);
// Convert back from 'int32' to 'uint8'
__m128i _trunc = _mm256_convert_epi32_epi8(_resi);
// Convert the packed 'uint8' to cv::Vec
cv::Vec<uchar, cn> interpolateColor;
_mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
return interpolateColor;
}
// One pre-generated benchmark input: four corner colors plus the cell
// coordinates (x1, x2, y1, y2) and the query point (x, y).
struct TestData
{
cv::Vec<uchar, 8> q11;
cv::Vec<uchar, 8> q12;
cv::Vec<uchar, 8> q21;
cv::Vec<uchar, 8> q22;
float x1, x2, y1, y2, x, y;
};
// Number of benchmark iterations / pre-generated inputs.
const int N = 10'000'000;
// Filled once in main() so both implementations are timed on identical data.
std::vector<TestData> testData;
/**
 * Benchmarks one interpolation implementation over the pre-generated inputs.
 * @param title label printed alongside the measured time
 * @param func  callable with the bilinearInterpolation_* signature
 * Every result's channels are folded into a checksum that is printed, so the
 * compiler cannot optimise the calls away and the two implementations can be
 * compared for equal output.
 */
template<typename T>
void test(const char* title, T func)
{
// steady_clock is monotonic wall time; clock() measured CPU time with
// coarse, platform-dependent resolution.
const auto start = std::chrono::steady_clock::now();
cv::Vec<uchar, 8> res;
int checksum = 0;
for(int i = 0; i < N; i++)
{
auto& d = testData[i];
res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);
#if PRINT
// Note: loop variable renamed so it no longer shadows the outer 'i'.
for (int c = 0; c < 8; c++)
{
std::cout << (int)res[c] << ", ";
}
std::cout << std::endl;
#endif
// Consume the result: keeps the call observable to the optimiser and
// doubles as a cheap checksum for cross-checking implementations.
for(int c = 0; c < 8; c++) checksum += res[c];
}
const auto end = std::chrono::steady_clock::now();
const double elapsed = std::chrono::duration<double>(end - start).count();
std::cout << checksum << std::endl;
std::cout << title << ": " << elapsed << "s" << std::endl;
}
// No-op overload, called once at the start of main().
// NOTE(review): this appears to do nothing — presumably a leftover warm-up
// or debugging stub; confirm whether it can be removed.
void test()
{
}
int main()
{
test();
testData.reserve(N);
for(int i = 0; i < N; i++)
{
TestData data;
cv::randu(data.q11, 0, 255);
cv::randu(data.q12, 0, 255);
cv::randu(data.q21, 0, 255);
cv::randu(data.q22, 0, 255);
data.x1 = random();
data.x2 = random();
data.y1 = random();
data.y2 = random();
data.x = random();
data.y = random();
testData.push_back(data);
}
test("original", bilinearInterpolation_original<8>);
test("mat", bilinearInterpolation_mat<8>);
return EXIT_SUCCESS;
}
2 Answers
Use a SIMD library
While using intrinsics is vastly better than using (inline) assembly, there are still lots of drawbacks: they are not portable to other architectures, and the code is not looking very pretty. You have to do a lot of things manually, like ensuring the alignment of data and converting from OpenCV types to the intrinsic vector types.
Luckily, there are libraries that are built on top of intrinsics that give you a nice and clean C++ experience. For example, Highway is one such library.
If you don't or can't use such a library, then consider writing your own types and functions that abstract away the platform-specific details. Ideally, the vectorized code should look like the non-vectorized code as much as possible, except instead of needing a loop to do the calculations per element of the vector, the operations are done on a whole vector at a time.
Use template specialization for code that only works on one vector size
The function bilinearInterpolation_mat() is templated on the size of the vector. However, the vectorized version only works correctly on vectors of size 8. In that case, make it a specialization of a template:
// Declaration of the template
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat(cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y);
// Specialization for size 8
template <>
cv::Vec<uchar, 8> bilinearInterpolation_mat(cv::Vec<uchar, 8> q11, cv::Vec<uchar, 8> q12, cv::Vec<uchar, 8> q21, cv::Vec<uchar, 8> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
...
}
Note that you can also have a generic version that is non-vectorized, and then have one or more vectorized specializations for specific vector sizes.
float random()
This is problematic, as my platform has a declaration of POSIX long random() in <cstdlib>, which prevents compilation. It's better to put such functions in a suitable namespace. Or we could even hide it inside main():
int main()
{
auto const random = []{
return static_cast<float>(std::rand() * (10.0 / RAND_MAX));
};
These aliases appear unused:
using std::cout; using std::endl;
There's a lot of unnecessary flushing (std::endl), where plain newline is sufficient.
It's probably better to use std::chrono::steady_clock for timing code, rather than std::clock(). We might even want to look at counting cycles, on CPUs with performance registers.
-
\$\begingroup\$ I get a bit further if I move the call outside the lambda:
static const __m256i _mask = _mm256_loadu_si256([]{ ⋯ return reinterpret_cast<const __m256i*>(mask.data()); }());. But then I get a similar inlining error for _mm256_shuffle_epi8(_a, _mask). \$\endgroup\$ — Toby Speight, Commented Oct 27, 2022 at 7:15
\$\begingroup\$ I got access to a system with Skylake processor, and it builds there, under GCC 10, so I guess the function is only inlineable if it's directly supported by the target. Ah, and
-march=skylake gets me building locally, although -march=haswell doesn't. Hmm. \$\endgroup\$ — Toby Speight, Commented Oct 27, 2022 at 7:18
\$\begingroup\$ On the Skylake machine, output (compiled with
-O3) is "1605913439 original: 0.243997s" and "766172929 mat: 0.223998s", i.e. an 8% speed-up. \$\endgroup\$ — Toby Speight, Commented Oct 27, 2022 at 7:22
\$\begingroup\$ If it's written
1605913439 and 766172929, then there is a bug, because the results are different between the intrinsic and non-intrinsic versions... @Toby Speight \$\endgroup\$ — rafoo, Commented Nov 13, 2022 at 8:26
1\$\begingroup\$ Thanks @G.Sliepen. What I meant was that although we appear to be using only the C Standard Library, we can get other functions as well. \$\endgroup\$Toby Speight– Toby Speight2022年11月27日 08:41:06 +00:00Commented Nov 27, 2022 at 8:41
(...e.g. _mm256_set_ps, which looks cheap but it's really not — it's quite expensive), and merge some of the packing that happens in the end. Also, is 16-bit fixed-point arithmetic OK? \$\endgroup\$ — (...using xor etc., in a way that part of those calculations could be shared. Ideally the data format would be more SoA-like, or AoSoA, but even a TestData* could be worked with. \$\endgroup\$) — (The real code does: for each pixel (x, y) { long list of sequential actions including bilinear interpolation at one moment }. I haven't written the code, but I have to optimize it. \$\endgroup\$)