\$\begingroup\$
\$\endgroup\$
A function to create a radial gradient from one RGB colour to another, using rayon to improve performance. Is there a better way to convert between some of the types, or are there any obvious performance improvements I could implement?
use image::RgbImage;
use rayon::prelude::*;
/// Renders a radial gradient that blends from `inner_color` to `outer_color`.
///
/// The gradient starts at a circle of diameter `foreground_size` centred on the image and
/// reaches `outer_color` at the image corners. Every pixel inside that circle is painted
/// with `inner_color` exactly; the blend factor is clamped to `[0, 1]` so no colour outside
/// the requested range is ever produced. If the circle already reaches the corners, the
/// whole image is filled with `inner_color`.
///
/// * `geometry` - image `(width, height)` in pixels.
/// * `inner_color` / `outer_color` - RGB triples; only the first three entries are read.
/// * `foreground_size` - diameter of the area that keeps the pure `inner_color`.
///
/// # Panics
///
/// Panics if either dimension is negative, or if the image is non-empty and a colour has
/// fewer than three channels.
fn radial_gradient(
    geometry: (i32, i32),
    inner_color: Vec<u8>,
    outer_color: Vec<u8>,
    foreground_size: i32,
) -> RgbImage {
    const CHANNELS: usize = 3;

    let width = u32::try_from(geometry.0).expect("geometry width must not be negative");
    let height = u32::try_from(geometry.1).expect("geometry height must not be negative");
    let mut background = RgbImage::new(width, height);
    // Nothing to paint, and a zero-sized row is not a valid chunk size below.
    if width == 0 || height == 0 {
        return background;
    }
    assert!(
        inner_color.len() >= CHANNELS && outer_color.len() >= CHANNELS,
        "inner_color and outer_color must each provide at least 3 channels"
    );

    // Convert the colours once instead of once per sub-pixel.
    let inner = [0_usize, 1, 2].map(|i| f64::from(inner_color[i]));
    let outer = [0_usize, 1, 2].map(|i| f64::from(outer_color[i]));

    // Squares are accumulated in i64 so that very wide or tall images cannot overflow.
    let distance = |x: i64, y: i64| ((x * x + y * y) as f64).sqrt();
    let center_x = i64::from(geometry.0 / 2);
    let center_y = i64::from(geometry.1 / 2);
    let foreground_half = f64::from(foreground_size / 2);
    // The background adapts to the foreground size so that `inner_color` sits at the edge
    // of the foreground rather than only at the exact centre of the image.
    let max_dist = distance(center_x, center_y) - foreground_half;

    // One work item per row: the row index comes from `enumerate` and the column is a
    // plain counter, so no per-pixel division or modulo is needed.
    background
        .par_chunks_exact_mut(CHANNELS * width as usize)
        .enumerate()
        .for_each(|(row_idx, row)| {
            // `row_idx` is below `height`, which fits in `u32`, so the cast is lossless.
            let y_dist = row_idx as i64 - center_y;
            for (col_idx, pixel) in (0_i64..).zip(row.chunks_exact_mut(CHANNELS)) {
                let x_dist = col_idx - center_x;
                let scaled_dist = if max_dist > 0.0 {
                    ((distance(x_dist, y_dist) - foreground_half) / max_dist).clamp(0.0, 1.0)
                } else {
                    // The foreground circle covers the whole image.
                    0.0
                };
                for ((subpix, &inner_c), &outer_c) in pixel.iter_mut().zip(&inner).zip(&outer) {
                    *subpix = (outer_c * scaled_dist + inner_c * (1.0 - scaled_dist)) as u8;
                }
            }
        });
    background
}
200_success
145k22 gold badges190 silver badges478 bronze badges
1 Answer 1
\$\begingroup\$
\$\endgroup\$
7
Things I changed:
- `x * x` is faster than `x.pow(2)`
- `f64` is way overkill for this, use `f32`
- Replace `Vec<u8>` inputs with `[u8; 3]`
- Use fused-multiply-add to implement `lerp`
- Move `distance` to a separate function for readability
- Move all constant values out of the loop
- A pixel is too small for efficient parallelization; parallelize over rows instead. Now that we have `.enumerate` as the row id, use a simple counter to get the column id.
Here's my benchmark suite:
use image::RgbImage;
use rayon::prelude::*;
/// Baseline radial-gradient renderer, kept as the reference point for the benchmark.
///
/// Each pixel is handled as its own parallel work item. Its blend factor is the distance
/// from the image centre, minus half of `foreground_size`, divided by the same quantity
/// measured at the image corner. The factor is not clamped; the final `as u8` cast
/// saturates any out-of-range channel value.
fn radial_gradient_orig(
    geometry: (i32, i32),
    inner_color: Vec<u8>,
    outer_color: Vec<u8>,
    foreground_size: i32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut canvas = RgbImage::new(width as u32, height as u32);

    let euclid = |dx: i32, dy: i32| f64::from(dx.pow(2) + dy.pow(2)).sqrt();

    // The background adapts to the foreground size so that the inner colour lands on the
    // edge of the art instead of only at the centre of the image.
    let corner_dist = euclid(width / 2, height / 2) - f64::from(foreground_size / 2);

    canvas
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(index, rgb)| {
            let index = i32::try_from(index).unwrap();
            let dx = index % width - width / 2;
            let dy = index / width - height / 2;
            let t = (euclid(dx, dy) - f64::from(foreground_size / 2)) / corner_dist;
            for (channel, value) in rgb.iter_mut().enumerate() {
                let outer_part = f64::from(outer_color[channel]) * t;
                let inner_part = f64::from(inner_color[channel]) * (1.0 - t);
                *value = (outer_part + inner_part) as u8;
            }
        });

    canvas
}
/// Linear interpolation between `a` and `b` by the factor `pct`.
///
/// Evaluates `pct * (b - a) + a` as a single fused multiply-add, so `pct = 0.0` yields `a`
/// and values of `pct` outside `[0, 1]` extrapolate along the same line.
#[inline]
fn lerp(pct: f32, a: f32, b: f32) -> f32 {
    let span = b - a;
    f32::mul_add(pct, span, a)
}
/// Euclidean length of the vector `(x, y)`.
///
/// The squares are accumulated in `u64`, so no `i32` input can overflow the intermediate
/// sum; for inputs small enough that `i32` arithmetic would not overflow, the result is
/// bit-identical to computing the sum in `i32`.
#[inline]
fn distance(x: i32, y: i32) -> f32 {
    // `unsigned_abs` also handles `i32::MIN`, whose magnitude does not fit in `i32`.
    let x = u64::from(x.unsigned_abs());
    let y = u64::from(y.unsigned_abs());
    ((x * x + y * y) as f32).sqrt()
}
/// First optimisation step: same per-pixel parallel layout as the baseline, but with `f32`
/// arithmetic, fixed-size colour arrays, and every loop-invariant value computed up front.
///
/// Relies on the file's `distance` and `lerp` helpers. The blend factor is not clamped;
/// the `as u8` cast saturates out-of-range channel values.
fn radial_gradient_improved_1(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut canvas = RgbImage::new(width, height);

    // The background adapts to the foreground size so that the inner colour lands on the
    // edge of the art instead of only at the centre of the image.
    let (center_x, center_y) = ((width / 2) as i32, (height / 2) as i32);
    let inner_radius = (foreground_size / 2) as f32;
    let gradient_span = distance(center_x, center_y) - inner_radius;

    let from = inner_color.map(f32::from);
    let to = outer_color.map(f32::from);

    canvas
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(index, rgb)| {
            // Recover the pixel coordinate from its flat index.
            let index = index as u32;
            let row = (index / width) as i32;
            let col = (index % width) as i32;
            let t = (distance(col - center_x, row - center_y) - inner_radius) / gradient_span;
            for ((channel, &start), &end) in rgb.iter_mut().zip(&from).zip(&to) {
                *channel = lerp(t, start, end) as u8;
            }
        });

    canvas
}
/// Second optimisation step: parallelises over whole rows instead of single pixels.
///
/// The row index comes from `enumerate` and the column index from a plain counter, which
/// removes the per-pixel division and modulo. The division by the maximum distance is
/// replaced with a multiplication by its precomputed reciprocal. Relies on the file's
/// `distance` and `lerp` helpers. The blend factor is not clamped; the `as u8` cast
/// saturates out-of-range channel values.
///
/// An image with zero width or height is returned empty.
fn radial_gradient_improved_2(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    const CHANNELS: usize = 3;

    let (width, height) = geometry;
    let mut background = RgbImage::new(width, height);
    // A zero-width image would make the row chunk size zero, which
    // `par_chunks_exact_mut` rejects with a panic; there is nothing to paint anyway.
    if width == 0 || height == 0 {
        return background;
    }

    // The background will adapt to the foreground size so that the inner_color will be at
    // the edges of the art and not just at the centre of the image.
    let center = ((width / 2) as i32, (height / 2) as i32);
    let foreground_half = (foreground_size / 2) as f32;
    let max_dist = distance(center.0, center.1) - foreground_half;
    let one_over_max_dist = 1.0 / max_dist;
    let inner_color = inner_color.map(f32::from);
    let outer_color = outer_color.map(f32::from);

    background
        .par_chunks_exact_mut(CHANNELS * width as usize)
        .enumerate()
        .for_each(|(pos_y, row)| {
            // The vertical offset is constant for the whole row.
            let dist_y = pos_y as i32 - center.1;
            for (pos_x, pixel) in row.chunks_exact_mut(CHANNELS).enumerate() {
                let dist_x = pos_x as i32 - center.0;
                let scaled_dist =
                    (distance(dist_x, dist_y) - foreground_half) * one_over_max_dist;
                for ((channel, &inner), &outer) in
                    pixel.iter_mut().zip(&inner_color).zip(&outer_color)
                {
                    *channel = lerp(scaled_dist, inner, outer) as u8;
                }
            }
        });
    background
}
/// Number of timed runs per implementation in `main`; the fastest run is the one reported.
const NUM_ITER: usize = 50;
/// Calls `render` `NUM_ITER` times and returns the duration of the fastest call in
/// milliseconds.
///
/// The rendered value is dropped only after the clock has been read, so freeing the
/// result is not part of the measurement.
fn best_time_ms<T>(mut render: impl FnMut() -> T) -> f32 {
    let fastest = (0..NUM_ITER)
        .map(|_| {
            let start = std::time::Instant::now();
            let rendered = render();
            let elapsed = start.elapsed();
            drop(rendered);
            elapsed
        })
        .min()
        .expect("NUM_ITER must be greater than zero");
    fastest.as_secs_f32() * 1000.0
}

/// Benchmarks the three gradient implementations on the same input, prints the best time
/// of each, and then writes one rendered image per implementation for visual comparison.
fn main() {
    let render_orig =
        || radial_gradient_orig((1300, 1024), vec![255, 128, 0], vec![0, 128, 255], 30);
    let render_improved_1 =
        || radial_gradient_improved_1((1300, 1024), [255, 128, 0], [0, 128, 255], 30);
    let render_improved_2 =
        || radial_gradient_improved_2((1300, 1024), [255, 128, 0], [0, 128, 255], 30);

    println!("Original: {} ms", best_time_ms(&render_orig));
    println!("Improved 1: {} ms", best_time_ms(&render_improved_1));
    println!("Improved 2: {} ms", best_time_ms(&render_improved_2));

    render_orig()
        .save("img_orig.bmp")
        .expect("failed to write img_orig.bmp");
    render_improved_1()
        .save("img_imp1.bmp")
        .expect("failed to write img_imp1.bmp");
    render_improved_2()
        .save("img_imp2.bmp")
        .expect("failed to write img_imp2.bmp");
}
> cargo run --release
Original: 5.9073 ms
Improved 1: 4.4981 ms
Improved 2: 2.6811001 ms
answered Jul 13, 2022 at 23:13
-
1\$\begingroup\$ Not really... It's usually trial and error. In an educated way, meaning, if possible, you introduce a hyper parameter for the chunk size and then plot speedup vs chunk size to get a feeling for it. But I don't know if rayon already does chunking internally, I don't think it spawned a work packet for every pixel, that would have been way slower. I could imagine that it just splits the work by number of threads and then does workstealing if one thread is faster. So it's really just trial and error. \$\endgroup\$Finomnis– Finomnis2022年07月15日 08:02:30 +00:00Commented Jul 15, 2022 at 8:02
-
1\$\begingroup\$ I primarily did it to get rid of the division and modulo operator to compute the pixel coordinate. Division and modulo are amongst the slowest operations that exist. \$\endgroup\$Finomnis– Finomnis2022年07月15日 08:03:58 +00:00Commented Jul 15, 2022 at 8:03
-
1\$\begingroup\$ That's also why I introduced the
one_over_max_dist
variable. Float multiplication is waaay faster than float division. \$\endgroup\$Finomnis– Finomnis2022年07月15日 08:07:24 +00:00Commented Jul 15, 2022 at 8:07 -
1\$\begingroup\$ Although compilers nowadays are really smart and apply many of those optimizations automatically, so it comes really down to benchmarking to see what has an effect and what doesn't. \$\endgroup\$Finomnis– Finomnis2022年07月15日 08:08:30 +00:00Commented Jul 15, 2022 at 8:08
-
1\$\begingroup\$ To optimize further, i think we would have to step into SSE/AVX territory. \$\endgroup\$Finomnis– Finomnis2022年07月15日 08:13:03 +00:00Commented Jul 15, 2022 at 8:13
lang-rust