Data set generation with Python and Matplotlib

Question 1

I want to use Python and Matplotlib to create a data set of images. I want to do this by transforming the plot to a numpy matrix. At the moment it takes about 0.1 second to generate 100 images of size 50x50 pixels:

enter image description here

Question: How can I speed up things a lot?

This is my code:

import matplotlib.pyplot as plt
import numpy as np
import time
import cv2
fig = plt.figure(frameon=False)
ax = fig.add_axes([0., 0., 1., 1.])
fig.set_size_inches((0.5,0.5))
fig_size = fig.canvas.get_width_height() + (3,)
points, = ax.plot([],[])
ax.set_xlim(0.,1.)
ax.set_ylim(0.,1.)
ax.set_axis_off()
ax.set_frame_on(False)
ax.grid(False)
def plot2mat():
 data = np.random.rand(20,2)
 points.set_xdata(data[:,0])
 points.set_ydata(data[:,1])
 fig.canvas.draw()
 fig.canvas.flush_events()
 M = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
 M = M.reshape(fig_size)
 M = cv2.cvtColor(M, cv2.COLOR_BGR2GRAY)
 return M
I = []
t0 = time.time()
runs = 1000
for k in range(runs):
 I.append(plot2mat())
print(time.time()-t0)
plt.close(fig)
for k in range(100):
 plt.subplot(10,10,k+1)
 plt.imshow(I[k])
 plt.axis("off")
plt.savefig("plot.png", dpi=400)

Question 2

An execution time of 0.1 second is not much. Do you want to scale this up by much that you're concerned about speed? What's your goal?

Question 3

@Mast I call plot2mat() from another script tens of thousands of times which makes this part a real bottleneck.

Question 4

First: what is actually being plotted? Each image data M is a 50*50 array of uint8, with only two values: 127 and 255 (matplotlib makes the yellow background due to the colour map). Each line is three pixels wide with no antialiasing. There are 20 vertices making 19 lines per image.

I think that the code is way too tightly-coupled with matplotlib. The following code does not help; it's about five times slower and much more complicated, but it demonstrates that generation of your dataset is possible without matplotlib. It uses linear algebra in up to five dimensions.

import time
import typing
import numpy as np
def generate_data(
 rand: np.random.Generator,
 n_images: int = 100,
 n_lines: int = 19,
 line_width: float = 3.,
 image_width: int = 50,
 norm_method: typing.Literal['einsum_param', 'einsum_diff', 'linalg'] = 'einsum_diff',
) -> np.ndarray:
 # coordinate space in each image
 ii = np.arange(image_width) # *50
 xy = np.stack(np.meshgrid(ii, ii), axis=-1) # 50*50*2
 # random start/end vertices for all line segments
 vertices = rand.integers(low=0, high=image_width, size=(1 + n_lines, n_images, 2)) # 20*2
 p0 = vertices[:-1] # 19*100*2
 p1 = vertices[1:] # 19*100*2
 # linear parameters for singularity-free homogeneous form [abc].[xy1] = 0
 dp = p1 - p0 # 19*100*2
 c = p0[...,1]*dp[...,0] - p0[...,0]*dp[...,1] # *19
 ab = dp[..., ::-1]*(1, -1) # 19*2
 a = ab[..., 0] # 19*100
 b = ab[..., 1] # 19*100
 # matrix for projection onto the line
 denom = 1/np.einsum('ijk,ijk->ij', ab, ab) # 19*100
 ba22 = np.broadcast_to(ab[..., ::-1, np.newaxis], ab.shape + (2,)) # 19*100*2*2
 projection = ( # 19*100*2*2
 denom[..., np.newaxis, np.newaxis] * ba22 * ba22.transpose((0, 1, 3, 2))
 ) * (
 (1, -1),
 (-1, 1),
 )
 # projected points
 xyp = ( # 19*100*50*50*2
 np.einsum('ijk,abkl->abijl', xy, projection)
 ) - (
 (c*denom)[..., np.newaxis]*ab
 )[:, :, np.newaxis, np.newaxis, :]
 # distances to line, three methods
 if norm_method == 'einsum_param':
 dist = np.abs( # 19*100*50*50
 np.einsum('ijk,abk->abij', xy, ab)
 + c[..., np.newaxis, np.newaxis]
 )/np.hypot(a, b)[..., np.newaxis, np.newaxis]
 bound = 0.5*line_width
 elif norm_method == 'einsum_diff':
 diff = xy - xyp # 19*100*50*50*2
 dist = np.einsum('ijklm,ijklm->ijkl', diff, diff) # 19*100*50*50
 bound = (0.5*line_width)**2
 elif norm_method == 'linalg':
 dist = np.linalg.norm(xy - xyp, axis=-1) # 19*100*50*50
 bound = 0.5*line_width
 # endpoint test: if 0 <= lhs <= rhs, then the projected point is on the segment
 lhs = (xyp - p0[:,:, np.newaxis, np.newaxis, :]).sum(axis=-1) # 19*50*50
 rhs = dp.sum(axis=-1) # *19
 r_sign = np.sign(rhs) # *19
 lhs *= r_sign[..., np.newaxis, np.newaxis]
 rhs *= r_sign
 data = np.full(shape=(n_images, image_width, image_width), fill_value=255, dtype=np.uint8) # 50*50
 data[
 (
 # width criteria
 (dist <= bound)
 # endpoint criteria
 & (lhs >= 0) & (lhs <= rhs[..., np.newaxis, np.newaxis])
 ).any(axis=0)
 ] = 128
 return data
def demo() -> None:
 rand = np.random.default_rng(seed=0)
 for i in range(3):
 for method in (
 'linalg', 'einsum_param', 'einsum_diff',
 ):
 start = time.perf_counter()
 generate_data(rand=rand, norm_method=method)
 dur = time.perf_counter() - start
 print(method, dur)
if __name__ == '__main__':
 demo()

Guessing at a method that would be more efficient (short of throwing everything out and writing in C), you could try ImageDraw.line (for a single line segment), or perhaps LineCollection.

Reinderien Reinderien 71k5 gold badges76 silver badges256 bronze badges · Answer 1 · 2024-11-17 01:42:10Z

First: what is actually being plotted? Each image data M is a 50*50 array of uint8, with only two values: 127 and 255 (matplotlib makes the yellow background due to the colour map). Each line is three pixels wide with no antialiasing. There are 20 vertices making 19 lines per image.

I think that the code is way too tightly-coupled with matplotlib. The following code does not help; it's about five times slower and much more complicated, but it demonstrates that generation of your dataset is possible without matplotlib. It uses linear algebra in up to five dimensions.

import time
import typing
import numpy as np
def generate_data(
 rand: np.random.Generator,
 n_images: int = 100,
 n_lines: int = 19,
 line_width: float = 3.,
 image_width: int = 50,
 norm_method: typing.Literal['einsum_param', 'einsum_diff', 'linalg'] = 'einsum_diff',
) -> np.ndarray:
 # coordinate space in each image
 ii = np.arange(image_width) # *50
 xy = np.stack(np.meshgrid(ii, ii), axis=-1) # 50*50*2
 # random start/end vertices for all line segments
 vertices = rand.integers(low=0, high=image_width, size=(1 + n_lines, n_images, 2)) # 20*2
 p0 = vertices[:-1] # 19*100*2
 p1 = vertices[1:] # 19*100*2
 # linear parameters for singularity-free homogeneous form [abc].[xy1] = 0
 dp = p1 - p0 # 19*100*2
 c = p0[...,1]*dp[...,0] - p0[...,0]*dp[...,1] # *19
 ab = dp[..., ::-1]*(1, -1) # 19*2
 a = ab[..., 0] # 19*100
 b = ab[..., 1] # 19*100
 # matrix for projection onto the line
 denom = 1/np.einsum('ijk,ijk->ij', ab, ab) # 19*100
 ba22 = np.broadcast_to(ab[..., ::-1, np.newaxis], ab.shape + (2,)) # 19*100*2*2
 projection = ( # 19*100*2*2
 denom[..., np.newaxis, np.newaxis] * ba22 * ba22.transpose((0, 1, 3, 2))
 ) * (
 (1, -1),
 (-1, 1),
 )
 # projected points
 xyp = ( # 19*100*50*50*2
 np.einsum('ijk,abkl->abijl', xy, projection)
 ) - (
 (c*denom)[..., np.newaxis]*ab
 )[:, :, np.newaxis, np.newaxis, :]
 # distances to line, three methods
 if norm_method == 'einsum_param':
 dist = np.abs( # 19*100*50*50
 np.einsum('ijk,abk->abij', xy, ab)
 + c[..., np.newaxis, np.newaxis]
 )/np.hypot(a, b)[..., np.newaxis, np.newaxis]
 bound = 0.5*line_width
 elif norm_method == 'einsum_diff':
 diff = xy - xyp # 19*100*50*50*2
 dist = np.einsum('ijklm,ijklm->ijkl', diff, diff) # 19*100*50*50
 bound = (0.5*line_width)**2
 elif norm_method == 'linalg':
 dist = np.linalg.norm(xy - xyp, axis=-1) # 19*100*50*50
 bound = 0.5*line_width
 # endpoint test: if 0 <= lhs <= rhs, then the projected point is on the segment
 lhs = (xyp - p0[:,:, np.newaxis, np.newaxis, :]).sum(axis=-1) # 19*50*50
 rhs = dp.sum(axis=-1) # *19
 r_sign = np.sign(rhs) # *19
 lhs *= r_sign[..., np.newaxis, np.newaxis]
 rhs *= r_sign
 data = np.full(shape=(n_images, image_width, image_width), fill_value=255, dtype=np.uint8) # 50*50
 data[
 (
 # width criteria
 (dist <= bound)
 # endpoint criteria
 & (lhs >= 0) & (lhs <= rhs[..., np.newaxis, np.newaxis])
 ).any(axis=0)
 ] = 128
 return data
def demo() -> None:
 rand = np.random.default_rng(seed=0)
 for i in range(3):
 for method in (
 'linalg', 'einsum_param', 'einsum_diff',
 ):
 start = time.perf_counter()
 generate_data(rand=rand, norm_method=method)
 dur = time.perf_counter() - start
 print(method, dur)
if __name__ == '__main__':
 demo()

Guessing at a method that would be more efficient (short of throwing everything out and writing in C), you could try ImageDraw.line (for a single line segment), or perhaps LineCollection.

Stack Exchange Network

Data set generation with Python and Matplotlib

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Data set generation with Python and Matplotlib

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions