Speed up function using cython

Question 1

I am trying to speed up one of my functions.

def get_scale_local_maximas(cube_coordinates, laplacian_cube):
"""
Check provided cube coordinate for scale space local maximas.
Returns only the points that satisfy the criteria.
A point is considered to be a local maxima if its value is greater
than the value of the point on the next scale level and the point
on the previous scale level. If the tested point is located on the
first scale level or on the last one, then only one inequality should
hold in order for this point to be local scale maxima.
Parameters
----------
cube_coordinates : (n, 3) ndarray
 A 2d array with each row representing 3 values, ``(y,x,scale_level)``
 where ``(y,x)`` are coordinates of the blob and ``scale_level`` is the
 position of a point in scale space.
laplacian_cube : ndarray of floats
 Laplacian of Gaussian scale space. 
Returns
-------
output : (n, 3) ndarray
 cube_coordinates that satisfy the local maximum criteria in
 scale space.
Examples
--------
>>> one = np.array([[1, 2, 3], [4, 5, 6]])
>>> two = np.array([[7, 8, 9], [10, 11, 12]])
>>> three = np.array([[0, 0, 0], [0, 0, 0]])
>>> check_coords = np.array([[1, 0, 1], [1, 0, 0], [1, 0, 2]])
>>> lapl_dummy = np.dstack([one, two, three])
>>> get_scale_local_maximas(check_coords, lapl_dummy)
array([[1, 0, 1]])
"""
amount_of_layers = laplacian_cube.shape[2]
amount_of_points = cube_coordinates.shape[0]
# Preallocate index. Fill it with False.
accepted_points_index = np.ones(amount_of_points, dtype=bool)
for point_index, interest_point_coords in enumerate(cube_coordinates):
 # Row coordinate
 y_coord = interest_point_coords[0]
 # Column coordinate
 x_coord = interest_point_coords[1]
 # Layer number starting from the smallest sigma
 point_layer = interest_point_coords[2]
 point_response = laplacian_cube[y_coord, x_coord, point_layer]
 # Check the point under the current one
 if point_layer != 0:
 lower_point_response = laplacian_cube[y_coord, x_coord, point_layer-1]
 if lower_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
 # Check the point above the current one
 if point_layer != (amount_of_layers-1):
 upper_point_response = laplacian_cube[y_coord, x_coord, point_layer+1]
 if upper_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
# Return only accepted points
return cube_coordinates[accepted_points_index]

This is my attempt to speed it up using Cython:

# cython: cdivision=True
# cython: boundscheck=False
# cython: nonecheck=False
# cython: wraparound=False
import numpy as np
cimport numpy as cnp
def get_scale_local_maximas(cube_coordinates, cnp.ndarray[cnp.double_t, ndim=3] laplacian_cube):
"""
Check provided cube coordinate for scale space local maximas.
Returns only the points that satisfy the criteria.
A point is considered to be a local maxima if its value is greater
than the value of the point on the next scale level and the point
on the previous scale level. If the tested point is located on the
first scale level or on the last one, then only one inequality should
hold in order for this point to be local scale maxima.
Parameters
----------
cube_coordinates : (n, 3) ndarray
 A 2d array with each row representing 3 values, ``(y,x,scale_level)``
 where ``(y,x)`` are coordinates of the blob and ``scale_level`` is the
 position of a point in scale space.
laplacian_cube : ndarray of floats
 Laplacian of Gaussian scale space. 
Returns
-------
output : (n, 3) ndarray
 cube_coordinates that satisfy the local maximum criteria in
 scale space.
Examples
--------
>>> one = np.array([[1, 2, 3], [4, 5, 6]])
>>> two = np.array([[7, 8, 9], [10, 11, 12]])
>>> three = np.array([[0, 0, 0], [0, 0, 0]])
>>> check_coords = np.array([[1, 0, 1], [1, 0, 0], [1, 0, 2]])
>>> lapl_dummy = np.dstack([one, two, three])
>>> get_scale_local_maximas(check_coords, lapl_dummy)
array([[1, 0, 1]])
"""
cdef Py_ssize_t y_coord, x_coord, point_layer, point_index
cdef cnp.double_t point_response, lower_point_response, upper_point_response
cdef Py_ssize_t amount_of_layers = laplacian_cube.shape[2]
cdef Py_ssize_t amount_of_points = cube_coordinates.shape[0]
# amount_of_layers = laplacian_cube.shape[2]
# amount_of_points = cube_coordinates.shape[0]
# Preallocate index. Fill it with False.
accepted_points_index = np.ones(amount_of_points, dtype=bool)
for point_index in range(amount_of_points):
 interest_point_coords = cube_coordinates[point_index]
 # Row coordinate
 y_coord = interest_point_coords[0]
 # Column coordinate
 x_coord = interest_point_coords[1]
 # Layer number starting from the smallest sigma
 point_layer = interest_point_coords[2]
 point_response = laplacian_cube[y_coord, x_coord, point_layer]
 # Check the point under the current one
 if point_layer != 0:
 lower_point_response = laplacian_cube[y_coord, x_coord, point_layer-1]
 if lower_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
 # Check the point above the current one
 if point_layer != (amount_of_layers-1):
 upper_point_response = laplacian_cube[y_coord, x_coord, point_layer+1]
 if upper_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
# Return only accepted points
return cube_coordinates[accepted_points_index]

But I can see no gain in the speed. And also I tried to replace cnp.ndarray[cnp.double_t, ndim=3] with memoryview cnp.double_t[:, :, ::1] but it only slowed down the whole code. I will appreciate any hints or corrections to my code. I am relatively new to Cython and I may have done something wrong.

Edit:

I fully rewrote my function in Cython:

def get_scale_local_maximas(cnp.ndarray[cnp.int_t, ndim=2] cube_coordinates, cnp.ndarray[cnp.double_t, ndim=3] laplacian_cube):
"""
Check provided cube coordinate for scale space local maximas.
Returns only the points that satisfy the criteria.
A point is considered to be a local maxima if its value is greater
than the value of the point on the next scale level and the point
on the previous scale level. If the tested point is located on the
first scale level or on the last one, then only one inequality should
hold in order for this point to be local scale maxima.
Parameters
----------
cube_coordinates : (n, 3) ndarray
 A 2d array with each row representing 3 values, ``(y,x,scale_level)``
 where ``(y,x)`` are coordinates of the blob and ``scale_level`` is the
 position of a point in scale space.
laplacian_cube : ndarray of floats
 Laplacian of Gaussian scale space. 
Returns
-------
output : (n, 3) ndarray
 cube_coordinates that satisfy the local maximum criteria in
 scale space.
Examples
--------
>>> one = np.array([[1, 2, 3], [4, 5, 6]])
>>> two = np.array([[7, 8, 9], [10, 11, 12]])
>>> three = np.array([[0, 0, 0], [0, 0, 0]])
>>> check_coords = np.array([[1, 0, 1], [1, 0, 0], [1, 0, 2]])
>>> lapl_dummy = np.dstack([one, two, three])
>>> get_scale_local_maximas(check_coords, lapl_dummy)
array([[1, 0, 1]])
"""
cdef Py_ssize_t y_coord, x_coord, point_layer, point_index
cdef cnp.double_t point_response, lower_point_response, upper_point_response
cdef Py_ssize_t amount_of_layers = laplacian_cube.shape[2]
cdef Py_ssize_t amount_of_points = cube_coordinates.shape[0]
# Preallocate index. Fill it with False.
accepted_points_index = np.ones(amount_of_points, dtype=bool)
for point_index in range(amount_of_points):
 interest_point_coords = cube_coordinates[point_index]
 # Row coordinate
 y_coord = interest_point_coords[0]
 # Column coordinate
 x_coord = interest_point_coords[1]
 # Layer number starting from the smallest sigma
 point_layer = interest_point_coords[2]
 point_response = laplacian_cube[y_coord, x_coord, point_layer]
 # Check the point under the current one
 if point_layer != 0:
 lower_point_response = laplacian_cube[y_coord, x_coord, point_layer-1]
 if lower_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
 # Check the point above the current one
 if point_layer != (amount_of_layers-1):
 upper_point_response = laplacian_cube[y_coord, x_coord, point_layer+1]
 if upper_point_response >= point_response:
 accepted_points_index[point_index] = False
 continue
# Return only accepted points
return cube_coordinates[accepted_points_index]

And after that I made some benchmarks with my function and with suggested function that was vectorized:

%timeit compiled.get_scale_local_maximas_np(coords, lapl_dummy)
%timeit compiled.get_scale_local_maximas(coords, lapl_dummy)
%timeit dynamic.get_scale_local_maximas_np(coords, lapl_dummy)
%timeit dynamic.get_scale_local_maximas(coords, lapl_dummy)
10000 loops, best of 3: 101 μs per loop
1000 loops, best of 3: 328 μs per loop
10000 loops, best of 3: 103 μs per loop
1000 loops, best of 3: 1.6 ms per loop

The compiled namespace represents these two functions compiled using Cython.

The dynamic namespace represents usual Python file.

So, I made a conclusion that in this case the numpy approach is better.

Question 2

You're already doing 98% of the work in NumPy. Cython can't do much more.

Question 3

did you try using cython magic from ipython notebook? it is going to highlight with yellow the lines where most of the calculation is spent. you can also expand and collapse those lines to see the cython generated c-code. good luck!

Question 4

You might not need cython for this, but if you do us it try building your code using cython -a. The html cython generates will help you see what cython is doing. Lines that can be directly translated to C are left white and lines that require a lot of overhead are highlighted in different shades of yellow. Your goal is to get any important loops to be all white.

Question 5

You forgot to specify types for cube_coordinates and accepted_points_index.

Question 6

Your Python code could still be improved as you're not "already doing 98% in numpy": you're still iterating over the rows of the coordinate array and performing 1-2 checks per row.

You could use numpy's "fancy indexing" and masks to get it fully in a vectorized form:

def get_scale_local_maximas_full_np(coords, cube):
 x, y, z = [ coords[:, ind] for ind in range(3) ]
 point_responses = cube[x, y, z]
 lowers = point_responses.copy()
 uppers = point_responses.copy()
 not_layer_0 = z > 0
 lower_responses = cube[x[not_layer_0], y[not_layer_0], z[not_layer_0]-1]
 lowers[not_layer_0] = lower_responses 
 not_max_layer = z < (cube.shape[2] - 1)
 upper_responses = cube[x[not_max_layer], y[not_max_layer], z[not_max_layer]+1]
 uppers[not_max_layer] = upper_responses
 lo_check = np.ones(z.shape, dtype=np.bool)
 lo_check[not_layer_0] = (point_responses > lowers)[not_layer_0]
 hi_check = np.ones(z.shape, dtype=np.bool)
 hi_check[not_max_layer] = (point_responses > uppers)[not_max_layer]
 return coords[lo_check & hi_check]

I've generated a set of somewhat larger data to test performance with:

lapl_dummy = np.random.rand(100,100,100)
coords = np.random.random_integers(0,99, size=(1000,3))

I get the following timing results:

In [146]: %timeit get_scale_local_maximas_full_np(coords, lapl_dummy)
10000 loops, best of 3: 175 μs per loop
In [147]: %timeit get_scale_local_maximas(coords, lapl_dummy)
100 loops, best of 3: 2.24 ms per loop

But of course, be careful with performance tests, because it depends often on the data used.

I have little experience with Cython, can't help you there.

Oliver W. 13.6k3 gold badges41 silver badges52 bronze badges · Accepted Answer · 2015-03-19 23:23:58Z

Your Python code could still be improved as you're not "already doing 98% in numpy": you're still iterating over the rows of the coordinate array and performing 1-2 checks per row.

You could use numpy's "fancy indexing" and masks to get it fully in a vectorized form:

def get_scale_local_maximas_full_np(coords, cube):
 x, y, z = [ coords[:, ind] for ind in range(3) ]
 point_responses = cube[x, y, z]
 lowers = point_responses.copy()
 uppers = point_responses.copy()
 not_layer_0 = z > 0
 lower_responses = cube[x[not_layer_0], y[not_layer_0], z[not_layer_0]-1]
 lowers[not_layer_0] = lower_responses 
 not_max_layer = z < (cube.shape[2] - 1)
 upper_responses = cube[x[not_max_layer], y[not_max_layer], z[not_max_layer]+1]
 uppers[not_max_layer] = upper_responses
 lo_check = np.ones(z.shape, dtype=np.bool)
 lo_check[not_layer_0] = (point_responses > lowers)[not_layer_0]
 hi_check = np.ones(z.shape, dtype=np.bool)
 hi_check[not_max_layer] = (point_responses > uppers)[not_max_layer]
 return coords[lo_check & hi_check]

I've generated a set of somewhat larger data to test performance with:

lapl_dummy = np.random.rand(100,100,100)
coords = np.random.random_integers(0,99, size=(1000,3))

I get the following timing results:

In [146]: %timeit get_scale_local_maximas_full_np(coords, lapl_dummy)
10000 loops, best of 3: 175 μs per loop
In [147]: %timeit get_scale_local_maximas(coords, lapl_dummy)
100 loops, best of 3: 2.24 ms per loop

But of course, be careful with performance tests, because it depends often on the data used.

I have little experience with Cython, can't help you there.

CollectivesTM on Stack Overflow

Speed up function using cython

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

CollectivesTM on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related