Radial gradient image generator

Question 1

A function to create a radial gradient from one RGB colour to another, using rayon to improve performance. Is there a better way to convert between some of the types, and are there any obvious performance improvements I could implement?

use image::RgbImage;
use rayon::prelude::*;
/// Renders a radial gradient that blends from `inner_color` to `outer_color`.
///
/// * `geometry` - image `(width, height)` in pixels; both values are cast to `u32`
///   with `as`, so negative values wrap to very large sizes.
/// * `inner_color` / `outer_color` - colour channels; entries `0..3` are read for
///   the three bytes of every pixel.
/// * `foreground_size` - half of this value is subtracted from every distance, so
///   the blend starts at that radius instead of at the exact centre.
///
/// The blend factor is not clamped: pixels closer to the centre than
/// `foreground_size / 2` get a negative factor and extrapolate past `inner_color`;
/// the final `as u8` cast saturates the result into `0..=255`.
///
/// # Panics
///
/// Panics if a pixel index does not fit in an `i32`, or if either colour has fewer
/// than three entries while the image contains at least one pixel.
fn radial_gradient(
    geometry: (i32, i32),
    inner_color: Vec<u8>,
    outer_color: Vec<u8>,
    foreground_size: i32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut background = RgbImage::new(width as u32, height as u32);

    // Square in i64 so that large offsets cannot overflow the i32 range; for
    // offsets that did not overflow before, the result is unchanged.
    let distance =
        |x: i32, y: i32| ((i64::from(x).pow(2) + i64::from(y).pow(2)) as f64).sqrt();

    // Everything below is loop-invariant, so compute it once instead of per pixel.
    let center = (width / 2, height / 2);
    let foreground_half = f64::from(foreground_size / 2);
    // The background adapts to the foreground size so that `inner_color` sits at
    // the edge of the art and not just at the centre of the image.
    let max_dist = distance(center.0, center.1) - foreground_half;

    background
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(pixel_num, pixel)| {
            // Convert the flat pixel index once and reuse it for both coordinates.
            let pixel_num = i32::try_from(pixel_num).expect("pixel index must fit in an i32");
            let x_dist = pixel_num % width - center.0;
            let y_dist = pixel_num / width - center.1;
            let scaled_dist = (distance(x_dist, y_dist) - foreground_half) / max_dist;
            for (i, subpix) in pixel.iter_mut().enumerate() {
                *subpix = ((f64::from(outer_color[i]) * scaled_dist)
                    + (f64::from(inner_color[i]) * (1.0 - scaled_dist)))
                    as u8;
            }
        });
    background
}

Question 2

Things I changed:

x * x is faster than x.pow(2)
f64 is way overkill for this, use f32
Replace Vec<u8> inputs with [u8; 3]
Use fused-multiply-add to implement lerp
Move distance to separate function for readability
Move all constant values out of the loop
A single pixel is too small a work item for efficient parallelization, so parallelize over rows instead. Since .enumerate() now yields the row index, a simple counter inside the row gives the column index.

Here's my benchmark suite:

use image::RgbImage;
use rayon::prelude::*;
/// Baseline radial-gradient renderer, timed by `main` and reported as "Original".
///
/// Kept unchanged on purpose so that the benchmark compares the later variants
/// against the code exactly as it was first written.
///
/// * `geometry` - image `(width, height)`; both values are cast to `u32` with `as`.
/// * `inner_color` / `outer_color` - colour channels; entries `0..3` are indexed
///   for the three bytes of every pixel.
/// * `foreground_size` - half of this value is subtracted from every distance.
///
/// # Panics
///
/// Panics if a pixel index does not fit in an `i32`, or if a colour is indexed
/// past its length.
fn radial_gradient_orig(
 geometry: (i32, i32),
 inner_color: Vec<u8>,
 outer_color: Vec<u8>,
 foreground_size: i32,
) -> RgbImage {
 let mut background: RgbImage = RgbImage::new(geometry.0 as u32, geometry.1 as u32);
 // Euclidean length of the offset (x, y), computed with i32 squares and an f64 root.
 let distance = |x: i32, y: i32| (((x).pow(2) + (y).pow(2)) as f64).sqrt();
 // The background will adapt to the foreground size so that the inner_color will be at the edges of the art
 // and not just at the centre of the image
 let max_dist =
 distance((geometry.0 / 2) as i32, (geometry.1 / 2) as i32) - (foreground_size / 2) as f64;
 background
 // One parallel work item per 3-byte pixel.
 .par_chunks_exact_mut(3)
 .enumerate()
 .for_each(|(pixel_num, pixel)| {
 // Recover the column (modulo) and row (division) from the flat pixel index,
 // then express both relative to the image centre.
 let x_dist = i32::try_from(pixel_num).unwrap() % geometry.0 - geometry.0 / 2;
 let y_dist = i32::try_from(pixel_num).unwrap() / geometry.0 - geometry.1 / 2;
 // Blend factor; it is not clamped, so it is negative inside the foreground radius.
 let scaled_dist = (distance(x_dist, y_dist) - (foreground_size / 2) as f64) / max_dist;
 for (i, subpix) in pixel.iter_mut().enumerate() {
 // Weighted average of the two colours; the `as u8` cast saturates to 0..=255.
 *subpix = ((outer_color[i] as f64 * scaled_dist)
 + (inner_color[i] as f64 * (1.0 - scaled_dist))) as u8
 }
 });
 background
}
/// Linearly interpolates between `a` (returned for `pct == 0.0`) and `b`
/// (returned for `pct == 1.0`).
///
/// `pct` is not clamped, so values outside `0.0..=1.0` extrapolate along the same
/// line. The multiplication and addition are fused into a single rounding step
/// through [`f32::mul_add`].
#[inline]
fn lerp(pct: f32, a: f32, b: f32) -> f32 {
    let span = b - a;
    f32::mul_add(pct, span, a)
}
/// Returns the Euclidean length of the offset `(x, y)`.
///
/// The squares are accumulated in `u64`, so the sum cannot overflow for any pair
/// of `i32` inputs (the largest possible sum is `2^63`). Summing in `i32` would
/// overflow as soon as an offset exceeds 46340. The exact integer sum is rounded
/// to `f32` once, so inputs that fit the `i32` computation give the same result.
#[inline]
fn distance(x: i32, y: i32) -> f32 {
    let x = u64::from(x.unsigned_abs());
    let y = u64::from(y.unsigned_abs());
    ((x * x + y * y) as f32).sqrt()
}
/// First optimisation step: `f32` arithmetic, fixed-size colour arrays and a
/// fused multiply-add blend, still with one parallel work item per pixel.
///
/// * `geometry` - image `(width, height)` in pixels.
/// * `inner_color` / `outer_color` - RGB colours blended by distance from the centre.
/// * `foreground_size` - half of this value is subtracted from every distance, so
///   the blend starts at that radius instead of at the exact centre.
///
/// The blend factor is not clamped; the final `as u8` cast saturates to `0..=255`.
fn radial_gradient_improved_1(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut canvas = RgbImage::new(width, height);

    let (center_x, center_y) = ((width / 2) as i32, (height / 2) as i32);
    let inner_radius = (foreground_size / 2) as f32;
    // Distance from the centre to the top-left corner, minus the foreground
    // radius: the span over which the colours are blended.
    let blend_span = distance(center_x, center_y) - inner_radius;

    let [inner_r, inner_g, inner_b] = inner_color.map(f32::from);
    let [outer_r, outer_g, outer_b] = outer_color.map(f32::from);

    canvas
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(index, rgb)| {
            // Split the flat pixel index into row and column.
            let index = index as u32;
            let row = (index / width) as i32;
            let column = (index % width) as i32;
            let offset = distance(column - center_x, row - center_y);
            let t = (offset - inner_radius) / blend_span;
            rgb[0] = lerp(t, inner_r, outer_r) as u8;
            rgb[1] = lerp(t, inner_g, outer_g) as u8;
            rgb[2] = lerp(t, inner_b, outer_b) as u8;
        });
    canvas
}
/// Second optimisation step: one parallel work item per image row, which removes
/// the per-pixel division and modulo and gives each task a useful amount of work.
///
/// * `geometry` - image `(width, height)` in pixels.
/// * `inner_color` / `outer_color` - RGB colours blended by distance from the centre.
/// * `foreground_size` - half of this value is subtracted from every distance, so
///   the blend starts at that radius instead of at the exact centre.
///
/// The blend factor is not clamped; the final `as u8` cast saturates to `0..=255`.
/// A zero-width image is returned empty, without touching the parallel iterator.
fn radial_gradient_improved_2(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut background = RgbImage::new(width, height);
    // `par_chunks_exact_mut` panics on a chunk size of zero, and there is nothing
    // to draw anyway.
    if width == 0 {
        return background;
    }

    // The background adapts to the foreground size so that `inner_color` sits at
    // the edge of the art and not just at the centre of the image.
    let center = ((width / 2) as i32, (height / 2) as i32);
    let foreground_half = (foreground_size / 2) as f32;
    let max_dist = distance(center.0, center.1) - foreground_half;
    // Multiply by the reciprocal in the hot loop instead of dividing per pixel.
    let one_over_max_dist = 1.0 / max_dist;
    let inner_color = inner_color.map(f32::from);
    let outer_color = outer_color.map(f32::from);

    background
        .par_chunks_exact_mut(3 * width as usize)
        .enumerate()
        .for_each(|(pos_y, row)| {
            // The vertical offset is constant for the whole row.
            let dist_y = pos_y as i32 - center.1;
            // Fixed-size pixel chunks replace manual index arithmetic and let the
            // compiler drop the per-access bounds checks.
            for (pos_x, pixel) in row.chunks_exact_mut(3).enumerate() {
                let dist_x = pos_x as i32 - center.0;
                let scaled_dist =
                    (distance(dist_x, dist_y) - foreground_half) * one_over_max_dist;
                pixel[0] = lerp(scaled_dist, inner_color[0], outer_color[0]) as u8;
                pixel[1] = lerp(scaled_dist, inner_color[1], outer_color[1]) as u8;
                pixel[2] = lerp(scaled_dist, inner_color[2], outer_color[2]) as u8;
            }
        });
    background
}
/// Number of timed runs per implementation; only the fastest run is reported.
const NUM_ITER: usize = 50;

/// Calls `render` [`NUM_ITER`] times and prints the shortest wall-clock time as
/// `"<label>: <milliseconds> ms"`.
///
/// The minimum is reported rather than the mean because it is the run least
/// disturbed by other activity on the machine.
fn report_fastest_run<T>(label: &str, render: impl Fn() -> T) {
    let fastest = (0..NUM_ITER)
        .map(|_| {
            let start = std::time::Instant::now();
            // The result stays bound until the elapsed time has been read, so
            // dropping it is not part of the measurement.
            let _img = render();
            start.elapsed()
        })
        .min()
        .expect("NUM_ITER must be at least 1");
    println!("{}: {} ms", label, fastest.as_secs_f32() * 1000.0);
}

/// Benchmarks the three gradient implementations, then writes one image per
/// implementation so that the outputs can be compared visually.
fn main() {
    report_fastest_run("Original", || {
        radial_gradient_orig((1300, 1024), vec![255, 128, 0], vec![0, 128, 255], 30)
    });
    report_fastest_run("Improved 1", || {
        radial_gradient_improved_1((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
    });
    report_fastest_run("Improved 2", || {
        radial_gradient_improved_2((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
    });

    radial_gradient_orig((1300, 1024), vec![255, 128, 0], vec![0, 128, 255], 30)
        .save("img_orig.bmp")
        .expect("failed to write img_orig.bmp");
    radial_gradient_improved_1((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
        .save("img_imp1.bmp")
        .expect("failed to write img_imp1.bmp");
    radial_gradient_improved_2((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
        .save("img_imp2.bmp")
        .expect("failed to write img_imp2.bmp");
}

> cargo run --release
Original: 5.9073 ms
Improved 1: 4.4981 ms
Improved 2: 2.6811001 ms

Question 3

Not really... It's usually trial and error, but in an educated way: if possible, you introduce a hyperparameter for the chunk size and then plot speedup vs. chunk size to get a feeling for it. I don't know whether rayon already does chunking internally, but I don't think it spawned a work packet for every pixel; that would have been way slower. I could imagine that it just splits the work by the number of threads and then does work stealing if one thread finishes early. So it's really just trial and error.

Question 4

I primarily did it to get rid of the division and modulo operator to compute the pixel coordinate. Division and modulo are amongst the slowest operations that exist.

Question 5

That's also why I introduced the one_over_max_dist variable. Float multiplication is waaay faster than float division.

Question 6

That said, compilers nowadays are really smart and apply many of those optimizations automatically, so it really comes down to benchmarking to see what has an effect and what doesn't.

Question 7

To optimize further, I think we would have to step into SSE/AVX territory.

Finomnis FinomnisFinomnis 4463 silver badges6 bronze badges · Accepted Answer · 2022-07-13 23:13:08Z

Things I changed:

x * x is faster than x.pow(2)
f64 is way overkill for this, use f32
Replace Vec<u8> inputs with [u8; 3]
Use fused-multiply-add to implement lerp
Move distance to separate function for readability
Move all constant values out of the loop
A single pixel is too small a work item for efficient parallelization, so parallelize over rows instead. Since .enumerate() now yields the row index, a simple counter inside the row gives the column index.

Here's my benchmark suite:

use image::RgbImage;
use rayon::prelude::*;
/// Baseline radial-gradient renderer, timed by `main` and reported as "Original".
///
/// Kept unchanged on purpose so that the benchmark compares the later variants
/// against the code exactly as it was first written.
///
/// * `geometry` - image `(width, height)`; both values are cast to `u32` with `as`.
/// * `inner_color` / `outer_color` - colour channels; entries `0..3` are indexed
///   for the three bytes of every pixel.
/// * `foreground_size` - half of this value is subtracted from every distance.
///
/// # Panics
///
/// Panics if a pixel index does not fit in an `i32`, or if a colour is indexed
/// past its length.
fn radial_gradient_orig(
 geometry: (i32, i32),
 inner_color: Vec<u8>,
 outer_color: Vec<u8>,
 foreground_size: i32,
) -> RgbImage {
 let mut background: RgbImage = RgbImage::new(geometry.0 as u32, geometry.1 as u32);
 // Euclidean length of the offset (x, y), computed with i32 squares and an f64 root.
 let distance = |x: i32, y: i32| (((x).pow(2) + (y).pow(2)) as f64).sqrt();
 // The background will adapt to the foreground size so that the inner_color will be at the edges of the art
 // and not just at the centre of the image
 let max_dist =
 distance((geometry.0 / 2) as i32, (geometry.1 / 2) as i32) - (foreground_size / 2) as f64;
 background
 // One parallel work item per 3-byte pixel.
 .par_chunks_exact_mut(3)
 .enumerate()
 .for_each(|(pixel_num, pixel)| {
 // Recover the column (modulo) and row (division) from the flat pixel index,
 // then express both relative to the image centre.
 let x_dist = i32::try_from(pixel_num).unwrap() % geometry.0 - geometry.0 / 2;
 let y_dist = i32::try_from(pixel_num).unwrap() / geometry.0 - geometry.1 / 2;
 // Blend factor; it is not clamped, so it is negative inside the foreground radius.
 let scaled_dist = (distance(x_dist, y_dist) - (foreground_size / 2) as f64) / max_dist;
 for (i, subpix) in pixel.iter_mut().enumerate() {
 // Weighted average of the two colours; the `as u8` cast saturates to 0..=255.
 *subpix = ((outer_color[i] as f64 * scaled_dist)
 + (inner_color[i] as f64 * (1.0 - scaled_dist))) as u8
 }
 });
 background
}
/// Linearly interpolates between `a` (returned for `pct == 0.0`) and `b`
/// (returned for `pct == 1.0`).
///
/// `pct` is not clamped, so values outside `0.0..=1.0` extrapolate along the same
/// line. The multiplication and addition are fused into a single rounding step
/// through [`f32::mul_add`].
#[inline]
fn lerp(pct: f32, a: f32, b: f32) -> f32 {
    let span = b - a;
    f32::mul_add(pct, span, a)
}
/// Returns the Euclidean length of the offset `(x, y)`.
///
/// The squares are accumulated in `u64`, so the sum cannot overflow for any pair
/// of `i32` inputs (the largest possible sum is `2^63`). Summing in `i32` would
/// overflow as soon as an offset exceeds 46340. The exact integer sum is rounded
/// to `f32` once, so inputs that fit the `i32` computation give the same result.
#[inline]
fn distance(x: i32, y: i32) -> f32 {
    let x = u64::from(x.unsigned_abs());
    let y = u64::from(y.unsigned_abs());
    ((x * x + y * y) as f32).sqrt()
}
/// First optimisation step: `f32` arithmetic, fixed-size colour arrays and a
/// fused multiply-add blend, still with one parallel work item per pixel.
///
/// * `geometry` - image `(width, height)` in pixels.
/// * `inner_color` / `outer_color` - RGB colours blended by distance from the centre.
/// * `foreground_size` - half of this value is subtracted from every distance, so
///   the blend starts at that radius instead of at the exact centre.
///
/// The blend factor is not clamped; the final `as u8` cast saturates to `0..=255`.
fn radial_gradient_improved_1(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut canvas = RgbImage::new(width, height);

    let (center_x, center_y) = ((width / 2) as i32, (height / 2) as i32);
    let inner_radius = (foreground_size / 2) as f32;
    // Distance from the centre to the top-left corner, minus the foreground
    // radius: the span over which the colours are blended.
    let blend_span = distance(center_x, center_y) - inner_radius;

    let [inner_r, inner_g, inner_b] = inner_color.map(f32::from);
    let [outer_r, outer_g, outer_b] = outer_color.map(f32::from);

    canvas
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(index, rgb)| {
            // Split the flat pixel index into row and column.
            let index = index as u32;
            let row = (index / width) as i32;
            let column = (index % width) as i32;
            let offset = distance(column - center_x, row - center_y);
            let t = (offset - inner_radius) / blend_span;
            rgb[0] = lerp(t, inner_r, outer_r) as u8;
            rgb[1] = lerp(t, inner_g, outer_g) as u8;
            rgb[2] = lerp(t, inner_b, outer_b) as u8;
        });
    canvas
}
/// Second optimisation step: one parallel work item per image row, which removes
/// the per-pixel division and modulo and gives each task a useful amount of work.
///
/// * `geometry` - image `(width, height)` in pixels.
/// * `inner_color` / `outer_color` - RGB colours blended by distance from the centre.
/// * `foreground_size` - half of this value is subtracted from every distance, so
///   the blend starts at that radius instead of at the exact centre.
///
/// The blend factor is not clamped; the final `as u8` cast saturates to `0..=255`.
/// A zero-width image is returned empty, without touching the parallel iterator.
fn radial_gradient_improved_2(
    geometry: (u32, u32),
    inner_color: [u8; 3],
    outer_color: [u8; 3],
    foreground_size: u32,
) -> RgbImage {
    let (width, height) = geometry;
    let mut background = RgbImage::new(width, height);
    // `par_chunks_exact_mut` panics on a chunk size of zero, and there is nothing
    // to draw anyway.
    if width == 0 {
        return background;
    }

    // The background adapts to the foreground size so that `inner_color` sits at
    // the edge of the art and not just at the centre of the image.
    let center = ((width / 2) as i32, (height / 2) as i32);
    let foreground_half = (foreground_size / 2) as f32;
    let max_dist = distance(center.0, center.1) - foreground_half;
    // Multiply by the reciprocal in the hot loop instead of dividing per pixel.
    let one_over_max_dist = 1.0 / max_dist;
    let inner_color = inner_color.map(f32::from);
    let outer_color = outer_color.map(f32::from);

    background
        .par_chunks_exact_mut(3 * width as usize)
        .enumerate()
        .for_each(|(pos_y, row)| {
            // The vertical offset is constant for the whole row.
            let dist_y = pos_y as i32 - center.1;
            // Fixed-size pixel chunks replace manual index arithmetic and let the
            // compiler drop the per-access bounds checks.
            for (pos_x, pixel) in row.chunks_exact_mut(3).enumerate() {
                let dist_x = pos_x as i32 - center.0;
                let scaled_dist =
                    (distance(dist_x, dist_y) - foreground_half) * one_over_max_dist;
                pixel[0] = lerp(scaled_dist, inner_color[0], outer_color[0]) as u8;
                pixel[1] = lerp(scaled_dist, inner_color[1], outer_color[1]) as u8;
                pixel[2] = lerp(scaled_dist, inner_color[2], outer_color[2]) as u8;
            }
        });
    background
}
/// Number of timed runs per implementation; only the fastest run is reported.
const NUM_ITER: usize = 50;

/// Calls `render` [`NUM_ITER`] times and prints the shortest wall-clock time as
/// `"<label>: <milliseconds> ms"`.
///
/// The minimum is reported rather than the mean because it is the run least
/// disturbed by other activity on the machine.
fn report_fastest_run<T>(label: &str, render: impl Fn() -> T) {
    let fastest = (0..NUM_ITER)
        .map(|_| {
            let start = std::time::Instant::now();
            // The result stays bound until the elapsed time has been read, so
            // dropping it is not part of the measurement.
            let _img = render();
            start.elapsed()
        })
        .min()
        .expect("NUM_ITER must be at least 1");
    println!("{}: {} ms", label, fastest.as_secs_f32() * 1000.0);
}

/// Benchmarks the three gradient implementations, then writes one image per
/// implementation so that the outputs can be compared visually.
fn main() {
    report_fastest_run("Original", || {
        radial_gradient_orig((1300, 1024), vec![255, 128, 0], vec![0, 128, 255], 30)
    });
    report_fastest_run("Improved 1", || {
        radial_gradient_improved_1((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
    });
    report_fastest_run("Improved 2", || {
        radial_gradient_improved_2((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
    });

    radial_gradient_orig((1300, 1024), vec![255, 128, 0], vec![0, 128, 255], 30)
        .save("img_orig.bmp")
        .expect("failed to write img_orig.bmp");
    radial_gradient_improved_1((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
        .save("img_imp1.bmp")
        .expect("failed to write img_imp1.bmp");
    radial_gradient_improved_2((1300, 1024), [255, 128, 0], [0, 128, 255], 30)
        .save("img_imp2.bmp")
        .expect("failed to write img_imp2.bmp");
}

> cargo run --release
Original: 5.9073 ms
Improved 1: 4.4981 ms
Improved 2: 2.6811001 ms

Not really... It's usually trial and error, but in an educated way: if possible, you introduce a hyperparameter for the chunk size and then plot speedup vs. chunk size to get a feeling for it. I don't know whether rayon already does chunking internally, but I don't think it spawned a work packet for every pixel; that would have been way slower. I could imagine that it just splits the work by the number of threads and then does work stealing if one thread finishes early. So it's really just trial and error.
I primarily did it to get rid of the division and modulo operator to compute the pixel coordinate. Division and modulo are amongst the slowest operations that exist.
That's also why I introduced the one_over_max_dist variable. Float multiplication is waaay faster than float division.
That said, compilers nowadays are really smart and apply many of those optimizations automatically, so it really comes down to benchmarking to see what has an effect and what doesn't.
To optimize further, I think we would have to step into SSE/AVX territory.

Stack Exchange Network

Radial gradient image generator

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Radial gradient image generator

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions