I have a very basic OpenGL instanced rendering setup that compiles and runs, but it is extremely slow. Even after days of asking and reading about how to fix it, I still have no clue what causes the problem.
Overlapping small window
What does slow mean? At the moment as you can see, it draws
16 000 instances (48 000 vertices) @ 512*512px resolution / 38-43 FPS
But if I start scaling the window, up to the actual size of my monitor (2560 * 1440) the FPS drops down to 1. I expect at least half a million vertices rendered at 60FPS, that would be the goal.
The setup is very simple, I use GLFW to create the window, GLEW to setup OpenGL properly. So it looks something like this:
// Simplified sketch of the program entry point: the elided steps are marked
// with "// ..." comments. It obtains the rendering resources from setup(),
// calls draw() inside the event loop and finally hands them to cleanup().
int main(void)
{
// ... init window and context
// setup() returns an untyped pointer that is passed unchanged to draw() and cleanup()
void *resources = setup();
// ... start of event loop
{
// ... clear, get buff data, viewport
draw(resources);
// ... swap buffs, poll events
}
cleanup(resources);
// ... clean up everything
return 0;
}
Now, the functions from the pseudo-snippet above are shown here; they live in the instrender.c
file. This is where the actual drawing happens:
#include <stdlib.h> // srand(), rand()
#include <jemalloc/jemalloc.h> // malloc(), free()
#include <time.h> // time()
#include <stdio.h> // fprintf()
#include <GL/glew.h> // GL*
/*----------------------------------------------------------------------------*/
// Handles to the GL objects created by setup(); the struct is passed around
// as an untyped pointer and released by cleanup().
typedef struct resources
{
GLuint vs_id; // vertex shader object (0 if compilation failed)
GLuint fs_id; // fragment shader object (0 if compilation failed)
GLuint program_id; // shader program the two shaders are attached to
GLuint coords_pos; // attribute index bound to the "coords" shader input
GLuint offset_pos; // attribute index bound to the "offset" shader input
GLuint colour_pos; // attribute index bound to the "colors" shader input
GLuint colour_buffer_id; // buffer object holding the per-instance colours
GLuint offset_buffer_id; // buffer object holding the per-instance offsets
} Resources;
/*----------------------------------------------------------------------------*/
// GLSL 1.50 (core) vertex shader source. It adds the `offset` input to the
// `coords` input to form gl_Position (z = 0, w = 1) and forwards the `colors`
// input to the next stage as `color`. No matrices or uniforms are used.
const char *vert_shader = " \
#version 150 core \n\
\n\
in vec2 coords; \n\
in vec2 offset; \n\
in vec3 colors; \n\
out vec3 color; \n\
\n\
void main() \n\
{ \n\
gl_Position = vec4(coords.x + offset.x, \n\
coords.y + offset.y, 0.0, 1.0); \n\
color = colors; \n\
} \n";
/*----------------------------------------------------------------------------*/
// GLSL 1.50 (core) fragment shader source. It writes the incoming `color`
// to the `fragment` output with the alpha component fixed at 1.0.
const char *frag_shader = " \
#version 150 core \n\
\n\
in vec3 color; \n\
out vec4 fragment; \n\
\n\
void main() \n\
{ \n\
fragment = vec4(color, 1.0); \n\
} \n";
/*----------------------------------------------------------------------------*/
/*
 * Create a shader object of the given type, upload `buffer` as its source
 * and compile it.
 *
 * shader_id  out: receives the new shader object, or 0 when compilation
 *            failed (the failed object is deleted before returning).
 * buffer     NUL-terminated GLSL source text.
 * type       shader stage, e.g. GL_VERTEX_SHADER or GL_FRAGMENT_SHADER.
 *
 * On failure the compiler's info log, if any, is written to stderr.
 */
void
load_and_compile_shader(GLuint *shader_id, const char *buffer, GLenum type)
{
    *shader_id = glCreateShader(type);
    glShaderSource(*shader_id, 1, (const GLchar **)&buffer, NULL);
    glCompileShader(*shader_id);

    /* Pre-set to "failed": glGetShaderiv leaves the variable untouched when
       it raises a GL error (e.g. because glCreateShader returned 0). */
    GLint is_compiled = GL_FALSE;
    glGetShaderiv(*shader_id, GL_COMPILE_STATUS, &is_compiled);
    if (is_compiled)
        return;

    fprintf(stderr, "Shader failed to compile\n");

    /* The reported length includes the terminating NUL; 0 means "no log". */
    GLint info_log_length = 0;
    glGetShaderiv(*shader_id, GL_INFO_LOG_LENGTH, &info_log_length);
    if (info_log_length > 0)
    {
        GLchar *info_log = malloc((size_t)info_log_length * sizeof *info_log);
        if (info_log)
        {
            glGetShaderInfoLog(*shader_id, info_log_length, NULL, info_log);
            fprintf(stderr, "%s\n", info_log);
            free(info_log);
        }
    }

    glDeleteShader(*shader_id);
    *shader_id = 0;
}
/*----------------------------------------------------------------------------*/
/* Number of instances submitted by draw() in a single instanced call. */
#define IR_OFFSET_COUNT 16000
/* Number of colour floats: one RGB triple per instance. The expansion is
   parenthesized so the macro stays a single operand inside larger
   expressions such as `x / IR_COLOUR_COUNT`. */
#define IR_COLOUR_COUNT (IR_OFFSET_COUNT * 3)
void *
setup(void)
{
Resources *rsc = malloc(sizeof(Resources));
if (!rsc)
{
fprintf(stderr, "Failed to allocate space for resources\n");
return (void *)NULL;
}
load_and_compile_shader(&rsc->vs_id, vert_shader, GL_VERTEX_SHADER);
load_and_compile_shader(&rsc->fs_id, frag_shader, GL_FRAGMENT_SHADER);
// Create new program and get program ID
rsc->program_id = glCreateProgram();
// Attach shaders
glAttachShader(rsc->program_id, rsc->vs_id);
glAttachShader(rsc->program_id, rsc->fs_id);
// Vertex coordinates
GLfloat vertices[] = { -.95f, -.95f,
-.95f, +.00f,
-.70f, -.95f };
// Vertex indices
GLushort indices[] = {0, 1, 2};
// Instance offsets
GLfloat offset[IR_OFFSET_COUNT];
srand(time(NULL));
for (int i=0; i<IR_OFFSET_COUNT; i++)
offset[i] = (GLfloat)(rand() % 200) / 100.f;
// Color values
GLfloat colors[IR_COLOUR_COUNT];
for (int i=0; i<IR_COLOUR_COUNT; i++)
colors[i] = (GLfloat)rand() / (GLfloat)RAND_MAX;
// Shader layout position
int pos_index = 0;
// Setup VAO
GLuint vertex_array_id;
glGenVertexArrays(1, &vertex_array_id);
glBindVertexArray(vertex_array_id);
// Setup coordinates VBO
GLuint vertex_buffer_id;
glGenBuffers(1, &vertex_buffer_id);
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_id);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
rsc->coords_pos = pos_index;
glBindAttribLocation(rsc->program_id, pos_index++, "coords");
glVertexAttribPointer(rsc->coords_pos,
2,
GL_FLOAT,
GL_FALSE,
2*sizeof(GLfloat),
(GLvoid *)NULL);
glEnableVertexAttribArray(rsc->coords_pos);
// Setup offsets VBO
glGenBuffers(1, &rsc->offset_buffer_id);
glBindBuffer(GL_ARRAY_BUFFER, rsc->offset_buffer_id);
glBufferData(GL_ARRAY_BUFFER, sizeof(offset), offset, GL_STATIC_DRAW);
rsc->offset_pos = pos_index;
glBindAttribLocation(rsc->program_id, pos_index++, "offset");
glVertexAttribPointer(rsc->offset_pos,
2,
GL_FLOAT,
GL_FALSE,
2*sizeof(GLfloat),
(GLvoid *)NULL);
glEnableVertexAttribArray(rsc->offset_pos);
glVertexAttribDivisor(rsc->offset_pos, 1);
// Setup colors VBO
glGenBuffers(1, &rsc->colour_buffer_id);
glBindBuffer(GL_ARRAY_BUFFER, rsc->colour_buffer_id);
glBufferData(GL_ARRAY_BUFFER, sizeof(colors), colors, GL_STATIC_DRAW);
rsc->colour_pos = pos_index;
glBindAttribLocation(rsc->program_id, pos_index++, "colors");
glVertexAttribPointer(rsc->colour_pos,
3,
GL_FLOAT,
GL_FALSE,
3*sizeof(GLfloat),
(GLvoid *)NULL);
glEnableVertexAttribArray(rsc->colour_pos);
glVertexAttribDivisor(rsc->colour_pos, 1);
// Link shader program
glLinkProgram(rsc->program_id);
// If there was a problem during the linking
GLint is_linked;
glGetProgramiv(rsc->program_id, GL_LINK_STATUS, &is_linked);
if (!is_linked)
{
fprintf(stderr, "Shader program linking failed.\n");
// Get debugging informations
GLint info_log_length;
glGetProgramiv(rsc->program_id, GL_INFO_LOG_LENGTH, &info_log_length);
GLchar *info_log = malloc(info_log_length*sizeof(GLchar));
glGetProgramInfoLog(rsc->program_id, info_log_length, NULL, info_log);
fprintf(stderr, "%s\n", info_log);
// Clean up
glDetachShader(rsc->program_id, rsc->vs);
glDetachShader(rsc->program_id, rsc->fs);
glDeleteProgram(rsc->program_id);
free(info_log);
rsc->program_id = 0;
return;
}
// Setup indices VBO
GLuint index_array_buffer_id;
glGenBuffers(1, &index_array_buffer_id);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_array_buffer_id);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
// Set fragment output
glBindFragDataLocation(rsc->program_id, 0, "fragment");
// Set basic GL options
glClearColor(.46f, .71f, .67f, 1.f);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
// Start using program
glUseProgram(rsc->program_id);
return rsc;
}
/*----------------------------------------------------------------------------*/
/*
 * Issue one instanced draw call: the 3-index triangle is drawn
 * IR_OFFSET_COUNT times using whatever program, vertex array and element
 * buffer are currently bound.
 *
 * resources  accepted for interface symmetry with setup()/cleanup(); it is
 *            not read here.
 */
void
draw(void *resources)
{
    (void)resources;

    glDrawElementsInstanced(GL_TRIANGLES,
        /* num of elems to draw */ 3,
        /* index value types */ GL_UNSIGNED_SHORT,
        /* pointer to indices */ 0,
        /* num of items to draw */ IR_OFFSET_COUNT);
}
/*----------------------------------------------------------------------------*/
void
cleanup(void *resources)
{
glBlendFunc(GL_ONE, GL_ZERO);
glDisable(GL_BLEND);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glUseProgram(0);
Resources *rsc = (Resources *)resources;
glDeleteShader(rsc->vs_id);
glDeleteShader(rsc->fs_id);
glDisableVertexAttribArray(rsc->coords_pos);
glDisableVertexAttribArray(rsc->offset_pos);
glDeleteProgram(rsc->program_id);
free(rsc);
}
Environment:
Video Card:
NVidia GeForce 9600M GT 512MB
OS/Compiler:
Mac OS X 10.9.3 / Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
UPDATE 1:
Based on some friendly advice about "over-drawing", I created a version where the triangles overlap only slightly, in one direction:
Non-overlapping small window
Now this produces a constant 48-50FPS
, however when I scale the window up to 2560*1440 this number drops down to 22-26FPS
(which is of course way better than the previous 1FPS
but still not the one I'm looking for):
Non-overlapping full screen
So I guess, the main problem is not over-drawing/overlapping.
UPDATE 2:
Here is a time profile I created:
Instrument Profile
As you can see, 75.7% of the time is spent to call/execute the glDrawElementsInstanced
function, and its subfunction calls.
UPDATE 3:
During the tests of the code with @syb0rg, another interesting bug appeared: roughly every 10th to 20th run (essentially at random), the program produces this and then crashes:
Noise Animation
1 Answer 1
Disclaimer: This is the first time I have ever really looked into using OpenGL
Keep in mind that my review may contain code that is not fully included in the question (such as the simplified main()
function).
Bugs
I found that when I tried to resize the window, I would always crash the program.
enter image description here
That wasn't very fun, so I set out to fix that first and foremost. It was rather easy to fix, all I had to do was add a frame buffer sizing callback, and set it to the window with
glfwSetFramebufferSizeCallback()
. The framebuffer size callback function:
/*
 * GLFW framebuffer-size callback: keeps the GL viewport equal to the
 * framebuffer so rendering covers the whole window after a resize.
 *
 * window  the window whose framebuffer changed (unused).
 * width   new framebuffer width in pixels.
 * height  new framebuffer height in pixels.
 *
 * Only the viewport is updated. The shaders are "#version 150 core" and
 * compute gl_Position directly from their inputs, so the fixed-function
 * matrix stack (glMatrixMode/glLoadIdentity/glOrtho) is neither read by them
 * nor part of the core profile; calling it there only raises
 * GL_INVALID_OPERATION.
 */
static void framebuffer_size_callback(GLFWwindow *window, int width, int height)
{
    (void)window;
    glViewport(0, 0, width, height);
}
The modified
run()
function:void run(GLFWwindow *window) { void *resources = setup(); glfwSetWindowUserPointer(window, resources); #ifdef MEASURE glfwSwapInterval(0); #endif int viewport_width, viewport_height; // set for proper resizing of window and viewport glfwSetFramebufferSizeCallback(window, framebuffer_size_callback); while (!glfwWindowShouldClose(window)) { glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glfwGetFramebufferSize(window, &viewport_width, &viewport_height); framebuffer_size_callback(window, viewport_width, viewport_height); draw(resources); glfwSwapBuffers(window); glfwPollEvents(); #ifdef MEASURE printfps(); #endif } cleanup(resources); }
This doesn't fully fix the crashing, occasionally the program will still misbehave; however, at least I am able to scale the window now more consistently.
When I undefined
OVERLAPPING_OFF
to benchmark the code on my computer, I got some errors about an unknown type name offset
. Looking at your code, you defined offset
only if OVERLAPPING_OFF
is defined, but then try to use it when it isn't defined as well. The fix was simple enough: I just moved the declaration of offset
outside of the #ifdef OVERLAPPING_OFF
.// Instance offsets int i; GLfloat offset[OFFSET_COUNT]; #ifdef OVERLAPPING_OFF int j, idx=0; GLfloat x, y; for (i=0; i<DIMi; i++) { y = (GLfloat)i * STEP; for (j=0; j<DIMi; j+=2) { offset[idx++] = x = (GLfloat)j * STEP; // x offset[idx++] = y; // y } } #else for (i=0; i<OFFSET_COUNT; i++) offset[i] = genc(); #endif
Optimization
Running my own profiling tests for a longer duration, I came up with the following data.
enter image description here
As we can see,
glDrawElementsInstanced()
now only takes up 2% of the total run time. The big time hogs are CGLFlushDrawable
and _glfwPlatformPollEvents.
There are two ways to process pending events.
glfwPollEvents()
processes only those events that have already been received and then returns immediately. This is the best choice when rendering continually, like most games do. If instead you only need to update your rendering once you have received new input,
glfwWaitEvents()
is a better choice. It waits until at least one event has been received, putting the thread to sleep in the meantime, and then processes all received events just like glfwPollEvents()
does. This saves a great deal of CPU cycles and is useful for, for example, many kinds of editing tools.
-
\$\begingroup\$ thanks, I completely forgot about the frame buffer size callback -- nice catch! The offset was a typo, but thanks for pointing that too! \$\endgroup\$Peter Varo– Peter Varo2014年05月27日 17:12:51 +00:00Commented May 27, 2014 at 17:12
-
\$\begingroup\$ @PeterVaro Oops, I accidentally rejected your edit when I made the edit of my own! If you re-submit it I'll approve. \$\endgroup\$syb0rg– syb0rg2014年05月27日 17:22:11 +00:00Commented May 27, 2014 at 17:22
-
\$\begingroup\$ okay, np, I updated again \$\endgroup\$Peter Varo– Peter Varo2014年05月27日 17:24:54 +00:00Commented May 27, 2014 at 17:24
-
\$\begingroup\$ btw: if
glfwSetFramebufferSizeCallback
is called -- so a callback function is set -- then calling the glfwGetFramebufferSize
is absolutely unnecessary. \$\endgroup\$Peter Varo– Peter Varo2014年05月27日 17:30:44 +00:00Commented May 27, 2014 at 17:30 -
\$\begingroup\$ @PeterVaro
glfwGetFramebufferSize
directly retrieves the current size of the framebuffer of a window, and stores it in viewport_width
and viewport_height
. It is still a necessary function call (you can see this when you remove the function call and run the program). \$\endgroup\$syb0rg– syb0rg2014年05月27日 17:34:31 +00:00Commented May 27, 2014 at 17:34
void*
unless you have to (e.g. because it can be any kind of pointer). In your case you know what it is so you can use the proper type. \$\endgroup\$void *
s in the first place. Ofc the above example is a bit simplified version of the actual code, but in real life, it has to be avoid *
since the caller does not know what pointer that is. \$\endgroup\$make && ./build/main
-- and don't press space, ifdefOVERLAPPING_OFF
. thank you very much for the effort! \$\endgroup\$