This is a follow-up question to An Updated Multi-dimensional Image Data Structure with Variadic Template Functions in C++. To apply a pixel operation to an Image class, I implemented the apply_each_pixel and apply_each_pixel_openmp template functions to compare the performance of std::execution::par against OpenMP. After testing multiple times, I found that apply_each_pixel_openmp runs faster than apply_each_pixel in a Visual Studio environment on Windows.
Compiler Information:
Microsoft (R) C/C++ Optimizing Compiler Version 19.44.34823.2 for x64 Copyright (C) Microsoft Corporation. All rights reserved.
The experimental implementation
apply_each_pixel template function implementation:

// apply_each_pixel template function implementation
template<class ExPo, class ElementT, class F, class... Args>
requires(std::is_execution_policy_v<std::remove_cvref_t<ExPo>>)
constexpr static auto apply_each_pixel(ExPo execution_policy, const Image<ElementT>& input, F operation, Args&&... args)
{
    auto transformed_image_data = recursive_transform<1>(
        execution_policy,
        [&](auto&& pixel_input) { return std::invoke(operation, pixel_input, args...); },
        input.getImageData());
    return Image<std::ranges::range_value_t<decltype(transformed_image_data)>>(transformed_image_data, input.getSize());
}
apply_each_pixel_openmp template function implementation:

// apply_each_pixel_openmp template function implementation
template<class ElementT, class F, class... Args>
constexpr static auto apply_each_pixel_openmp(const Image<ElementT>& input, F operation, Args&&... args)
{
    std::vector<std::invoke_result_t<F, ElementT, Args...>> output_vector;
    auto input_count = input.count();
    auto input_vector = input.getImageData();
    output_vector.resize(input_count);
    #pragma omp parallel for
    for (std::size_t i = 0; i < input_count; ++i)
    {
        output_vector[i] = std::invoke(operation, input_vector[i], args...);
    }
    return Image<std::invoke_result_t<F, ElementT>>(output_vector, input.getSize());
}
recursive_transform template function implementation:

// recursive_transform template function for the multiple parameters cases (the version with unwrap_level)
template<std::size_t unwrap_level = 1, std::copy_constructible F, class Arg1, class... Args>
requires(unwrap_level <= recursive_depth<Arg1>())
constexpr auto recursive_transform(const F& f, const Arg1& arg1, const Args&... args)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_variadic_invoke_result_t<unwrap_level, F, Arg1, Args...> output{};
        transform(
            std::inserter(output, std::ranges::end(output)),
            [&f](auto&& element1, auto&&... elements) { return recursive_transform<unwrap_level - 1>(f, element1, elements...); },
            std::ranges::cbegin(arg1),
            std::ranges::cend(arg1),
            std::ranges::cbegin(args)...
        );
        return output;
    }
    else if constexpr (std::regular_invocable<F, Arg1, Args...>)
    {
        return std::invoke(f, arg1, args...);
    }
    else
    {
        static_assert(!std::regular_invocable<F, Arg1, Args...>,
            "The function passed to recursive_transform() cannot be invoked"
            "with the element types at the given recursion level.");
    }
}

// recursive_transform template function for std::array (the version with unwrap_level)
template<
    std::size_t unwrap_level = 1,
    template<class, std::size_t> class Container,
    typename T,
    std::size_t N,
    typename F,
    class... Args>
requires (std::ranges::input_range<Container<T, N>>)
constexpr auto recursive_transform(const F& f, const Container<T, N>& arg1, const Args&... args)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_array_invoke_result_t<unwrap_level, F, Container, T, N, Args...> output{};
        transform(
            std::ranges::begin(output),
            [&f](auto&& element1, auto&&... elements) { return recursive_transform<unwrap_level - 1>(f, element1, elements...); },
            std::ranges::cbegin(arg1),
            std::ranges::cend(arg1),
            std::ranges::cbegin(args)...
        );
        return output;
    }
    else if constexpr (std::regular_invocable<F, Container<T, N>, Args...>)
    {
        return std::invoke(f, arg1, args...);
    }
    else
    {
        static_assert(!std::regular_invocable<F, Container<T, N>, Args...>,
            "The function passed to recursive_transform() cannot be invoked"
            "with the element types at the given recursion level.");
    }
}

// recursive_transform template function implementation (the version with unwrap_level, with execution policy)
template<std::size_t unwrap_level = 1, class ExPo, class T, class F>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>> && unwrap_level <= recursive_depth<T>())
constexpr auto recursive_transform(ExPo execution_policy, const F& f, const T& input)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_invoke_result_t<unwrap_level, F, T> output{};
        output.resize(input.size());
        std::mutex mutex;
        std::transform(
            execution_policy,
            std::ranges::cbegin(input),
            std::ranges::cend(input),
            std::ranges::begin(output),
            [&](auto&& element)
            {
                std::lock_guard lock(mutex);
                return recursive_transform<unwrap_level - 1>(execution_policy, f, element);
            });
        return output;
    }
    else if constexpr (std::regular_invocable<F, T>)
    {
        return std::invoke(f, input);
    }
    else
    {
        static_assert(!std::regular_invocable<F, T>,
            "The function passed to recursive_transform() cannot be invoked"
            "with the element types at the given recursion level.");
    }
}

#ifdef USE_BOOST_ITERATOR
#include <boost/iterator/zip_iterator.hpp>
// recursive_transform template function for the binary operation cases (the version with unwrap_level, with execution policy)
template<std::size_t unwrap_level = 1, class ExPo, class T1, class T2, class F>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>> && unwrap_level <= recursive_depth<T>())
constexpr auto recursive_transform(ExPo execution_policy, const F& f, const T1& input1, const T2& input2)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_variadic_invoke_result_t<unwrap_level, F, T1, T2> output{};
        assert(input1.size() == input2.size());
        std::mutex mutex;

        //  Reference: https://stackoverflow.com/a/10457201/6667035
        //  Reference: https://www.boost.org/doc/libs/1_76_0/libs/iterator/doc/zip_iterator.html
        std::for_each(execution_policy,
            boost::make_zip_iterator(
                boost::make_tuple(std::ranges::cbegin(input1), std::ranges::cbegin(input2))
            ),
            boost::make_zip_iterator(
                boost::make_tuple(std::ranges::cend(input1), std::ranges::cend(input2))
            ),
            [&](auto&& elements)
            {
                auto result = recursive_transform<unwrap_level - 1>(execution_policy, f, boost::get<0>(elements), boost::get<1>(elements));
                std::lock_guard lock(mutex);
                output.emplace_back(std::move(result));
            }
        );
        return output;
    }
    else
    {
        return std::invoke(f, input1, input2);
    }
}
#endif
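A minimal usage sketch of recursive_transform on a nested container is shown below; it assumes the TinyDIP headers that define recursive_transform and its helper traits are included, and that the function lives in the TinyDIP namespace like the other functions in the test code, so it is an illustration rather than part of the library:

// Usage sketch (hypothetical standalone program; assumes the TinyDIP headers above are included).
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::vector<int>> nested{ {1, 2, 3}, {4, 5, 6} };

    // unwrap_level = 2: the callable receives each innermost int,
    // and the result keeps the nested shape.
    auto squared = TinyDIP::recursive_transform<2>(
        [](int x) { return x * x; },
        nested);

    for (const auto& row : squared)
        for (const auto& value : row)
            std::cout << value << ' ';    // expected output: 1 4 9 16 25 36
    std::cout << '\n';
    return 0;
}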
The usage of the apply_each_pixel and apply_each_pixel_openmp template functions:
/* Developed by Jimmy Hu */
#include <chrono>
#include "../base_types.h"
#include "../basic_functions.h"
#include "../image.h"
#include "../image_io.h"
#include "../image_operations.h"

template<class ExPo, class ElementT>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>>)
constexpr static auto Processing(
    ExPo execution_policy,
    const TinyDIP::Image<ElementT>& input,
    std::ostream& os = std::cout
)
{
    auto hsv_image = TinyDIP::rgb2hsv(execution_policy, input);
    auto start1 = std::chrono::system_clock::now();
    auto processed_hsv_image1 = TinyDIP::apply_each_pixel(
        std::execution::par,
        hsv_image,
        [&](TinyDIP::HSV pixel) -> TinyDIP::HSV
        {
            TinyDIP::HSV pixel_for_filling;
            pixel_for_filling.channels[0] = 0;
            pixel_for_filling.channels[1] = 1;
            pixel_for_filling.channels[2] = 255;
            if (pixel.channels[2] > 100 && pixel.channels[2] < 220)
            {
                pixel = pixel_for_filling;
            }
            return pixel;
        }
    );
    auto end1 = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds1 = end1 - start1;
    os << "elapsed time with using parallel execution policy: " << elapsed_seconds1.count() << '\n';
    auto start2 = std::chrono::system_clock::now();
    auto processed_hsv_image2 = TinyDIP::apply_each_pixel_openmp(hsv_image,
        [&](TinyDIP::HSV pixel) -> TinyDIP::HSV
        {
            TinyDIP::HSV pixel_for_filling;
            pixel_for_filling.channels[0] = 0;
            pixel_for_filling.channels[1] = 1;
            pixel_for_filling.channels[2] = 255;
            if (pixel.channels[2] > 100 && pixel.channels[2] < 220)
            {
                pixel = pixel_for_filling;
            }
            return pixel;
        });
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds2 = end2 - start2;
    os << "elapsed time with using Openmp: " << elapsed_seconds2.count() << '\n';
    os << TinyDIP::sum(TinyDIP::difference(processed_hsv_image1, processed_hsv_image2)) << '\n';
    auto output_image = TinyDIP::hsv2rgb(execution_policy, processed_hsv_image1);
    return output_image;
}

int main()
{
    auto start = std::chrono::system_clock::now();
    std::string image_filename = "1.bmp";
    auto image_input = TinyDIP::bmp_read(image_filename.c_str(), true);
    image_input = TinyDIP::copyResizeBicubic(image_input, 3 * image_input.getWidth(), 3 * image_input.getHeight());
    TinyDIP::bmp_write("BeforeProcessing", image_input);
    auto output_image = Processing(std::execution::seq, image_input);
    TinyDIP::bmp_write("AfterProcessing", output_image);
    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsed_seconds = end - start;
    std::time_t end_time = std::chrono::system_clock::to_time_t(end);
    std::cout << "Computation finished at " << std::ctime(&end_time) << "elapsed time: " << elapsed_seconds.count() << '\n';
    return EXIT_SUCCESS;
}
The output of the test code above:
Width of the input image: 454
Height of the input image: 341
Size of the input image(Byte): 464442
elapsed time with using parallel execution policy: 0.142722
elapsed time with using Openmp: 0.0333061
{0, 0, 0}
Computation finished at Thu Feb 13 14:23:46 2025
elapsed time: 0.563649
All suggestions are welcome.
The summary information:
Which question is this a follow-up to?
An Updated Multi-dimensional Image Data Structure with Variadic Template Functions in C++
What changes have been made in the code since the last question?
I implemented the apply_each_pixel and apply_each_pixel_openmp template functions in this post.
Why is a new review being asked for?
Please review the implementation of the apply_each_pixel and apply_each_pixel_openmp template functions and their tests.
1 Answer
recursive_transform is entirely unnecessary. If the OpenMP version can do it with a single loop, why can't the other one?
template<class ExPo, class ElementT, class F, class... Args>
requires(std::is_execution_policy_v<std::remove_cvref_t<ExPo>>)
constexpr static auto apply_each_pixel(ExPo execution_policy, const Image<ElementT>& input, F operation, Args&&... args)
{
    std::vector<std::invoke_result_t<F, ElementT, Args...>> output_vector;
    auto input_count = input.count();
    auto input_vector = input.getImageData();
    output_vector.resize(input_count);
    std::transform(execution_policy, input_vector.begin(), input_vector.end(), output_vector.begin(),
        [&](auto& elem) { return std::invoke(operation, elem, args...); });
    return Image<std::invoke_result_t<F, ElementT>>(output_vector, input.getSize());
}
You absolutely must not lock a mutex within an operation you pass to std::transform when you are passing an arbitrary execution policy, because it may be one of the unsequenced ones.

Unsequenced execution policies are the only case where function calls are unsequenced with respect to each other, meaning they can be interleaved. In all other situations in C++, they are indeterminately-sequenced (cannot interleave). Because of that, users are not allowed to allocate or deallocate memory, acquire mutexes, use non-lockfree std::atomic specializations, or, in general, perform any vectorization-unsafe operations when using these policies (vectorization-unsafe functions are the ones that synchronize-with another function, e.g. std::mutex::unlock synchronizes-with the next std::mutex::lock).
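As an illustration of that constraint, a mutex-free sketch of the execution-policy overload of recursive_transform is shown below (assuming the question's recursive_invoke_result_t and recursive_depth helpers are available). std::transform already writes each result to a distinct output element, so the lock can simply be dropped; note that deeper recursion levels still allocate inside the element function, so only the top level is strictly safe for unsequenced policies:

// Sketch: execution-policy overload of recursive_transform without the mutex.
// Assumes the recursive_invoke_result_t and recursive_depth helpers from the question.
template<std::size_t unwrap_level = 1, class ExPo, class T, class F>
requires (std::is_execution_policy_v<std::remove_cvref_t<ExPo>> && unwrap_level <= recursive_depth<T>())
constexpr auto recursive_transform(ExPo execution_policy, const F& f, const T& input)
{
    if constexpr (unwrap_level > 0)
    {
        recursive_invoke_result_t<unwrap_level, F, T> output{};
        output.resize(input.size());
        // Each invocation writes to its own output element, so no lock is required.
        std::transform(
            execution_policy,
            std::ranges::cbegin(input),
            std::ranges::cend(input),
            std::ranges::begin(output),
            [&](auto&& element)
            {
                return recursive_transform<unwrap_level - 1>(execution_policy, f, element);
            });
        return output;
    }
    else
    {
        return std::invoke(f, input);
    }
}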
The std::execution::par one is using complicated recursion down to the point of doing a single pixel in each task. That will generate many more tasks than are needed (so much more task creation and scheduling cost). Can you not use a simple parallel loop in the C++ code (maybe std::for_each(std::execution::par_unseq, ...))?
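A rough sketch of that suggestion follows; it assumes the same Image<ElementT> interface (count(), getImageData(), getSize()) as in the question, and the name apply_each_pixel_par_unseq is made up for illustration:

// Sketch: flat parallel loop over pixel indices with std::for_each.
// The loop body takes no locks and performs no allocation, so par_unseq is permissible.
#include <algorithm>
#include <cstddef>
#include <execution>
#include <functional>
#include <numeric>
#include <type_traits>
#include <vector>

template<class ElementT, class F, class... Args>
static auto apply_each_pixel_par_unseq(const Image<ElementT>& input, F operation, Args&&... args)
{
    std::vector<std::invoke_result_t<F, ElementT, Args...>> output_vector(input.count());
    auto input_vector = input.getImageData();

    // Materialize the pixel indices once, then run a single flat parallel loop.
    std::vector<std::size_t> indices(output_vector.size());
    std::iota(indices.begin(), indices.end(), std::size_t{0});
    std::for_each(std::execution::par_unseq, indices.begin(), indices.end(),
        [&](std::size_t i)
        {
            output_vector[i] = std::invoke(operation, input_vector[i], args...);
        });
    return Image<std::invoke_result_t<F, ElementT, Args...>>(output_vector, input.getSize());
}

Keeping the vectorization-unsafe work (the two allocations) outside the parallel region is what makes par_unseq acceptable here.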