I'm building a spiking neural net (recurrent, integrate-and-fire), and I'm curious how to reduce the warp divergence (and other problems) I may have.
Here's an example with a few hand-placed neurons and synapses to make it easier to follow. I've uploaded the whole code to a Git repo for easier access; run
make tui
then ./cudasnn
to try it.
The very basic workflow is to execute the 4 kernels (explained in the comments) one after another.
After 5000 cycles, here are the times in ms for each kernel, in launch order:
- 1 (check_input) - 0.196ms
- 2 (check_synapses) - 3.558ms
- 3 (carry_reset) - 0.038ms
- 4 (check_spikes) - 4.416ms to 6.278ms
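(The posted code times the whole pipeline at once; per-kernel numbers like these can be taken with a cudaEvent pair around each individual launch. A minimal sketch for the first kernel, using the same event API as in simulate_network below:)
cudaEvent_t k_start, k_stop;
float k_ms;
cudaEventCreate(&k_start);
cudaEventCreate(&k_stop);
cudaEventRecord(k_start, 0);
check_input <<< 1, nw->group_size >>> (neur_cptr, nw->timestep);
cudaEventRecord(k_stop, 0);
cudaEventSynchronize(k_stop); /* wait for the launch to finish */
cudaEventElapsedTime(&k_ms, k_start, k_stop); /* time for this launch only, in ms */
cudaEventDestroy(k_start);
cudaEventDestroy(k_stop);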
My code is split into three files, whose names are pretty self-explanatory.
NN.hpp (which contains my structures)
#ifndef NN_HPP
# define NN_HPP
# include <iostream>
/* Window Setting -- for SFML, no need here */
# define WIDTH 1280
# define HEIGHT 720
# define XOFFSET -0
# define YOFFSET -95
/* Network Settings */
# define STIMULUS 1
# define INHIBITION -1
# define STIMULUS_RATIO 80
# define SPIKE_BUFFER 4
# define INPUT 0
# define OUTPUT 1
# define HIDDEN 2
typedef struct s_neuron_info
{
float x, y, z;
char gid;
unsigned char group; // hidden, input, output
} t_neuron_info;
typedef struct s_neuron
{
short in_time;
float in_value;
int next_time;
float action_potential;
float threshold; // fire when threshold reached
float weight;
char type; // stimulus, inhibition
char carry;
} t_neuron;
typedef struct s_synapse
{
int id_in;
int id_out;
int axonal_delay; // in timestep
} t_synapse;
typedef struct s_spike
{
int syn_id;
int id_out;
int start_t, end_t; // in timestep
float value;
bool active;
} t_spike;
typedef struct s_network
{
t_neuron *neurons;
t_neuron_info *neurons_info;
t_synapse *synapses;
t_spike *spikes;
int neur_count;
int syn_count;
int timestep;
int group_size;
} t_network;
/* Generation */
t_network *generate_network(void);
/* GPU Simulation */
float simulate_network(t_network *nw);
#endif
simulate_network.cu (which contains the kernels and a small temporary main)
#include <cuda.h>
#include <cstdio>  /* printf, fprintf */
#include <cstdlib> /* exit */
#include <climits> /* INT_MAX */
#include <ctime>   /* time, for srand in main */
#include "NN.hpp"
#define cudaErrorAbort(msg) \
{ \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "[CUDA Error] %s : %s (%s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
exit(1); \
} \
}
/* First Kernel
* ============
* Neuronal parallelism on the INPUT group,
* check if an INPUT neuron should fire or not.
*/
__global__ void check_input(t_neuron *n, int timestep)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (timestep >= n[idx].next_time)
{
n[idx].action_potential += n[idx].in_value;
n[idx].next_time += n[idx].in_time;
}
}
/* Second Kernel
* =============
* Synaptic parallelism, check pre-synaptic neurons
* and create a spike if AP >= threshold
*/
__global__ void check_synapses(t_neuron *n, t_synapse *s, t_spike *sp, int timestep, int scount)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int j = idx * SPIKE_BUFFER; /* fallback slot, so j is never used uninitialized when the whole buffer is active */
if (n[s[idx].id_in].action_potential < n[s[idx].id_in].threshold)
return ;
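/* many synapses can share a pre-synaptic neuron; they all write the
 * same value here, so the unsynchronized write is benign */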
n[s[idx].id_in].carry = 1;
for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
{
if (sp[i].active)
continue;
j = i;
break ;
}
sp[j].syn_id = idx;
sp[j].id_out = s[idx].id_out;
sp[j].start_t = timestep;
sp[j].end_t = timestep + s[idx].axonal_delay;
sp[j].value = n[s[idx].id_in].action_potential * 0.2f * n[s[idx].id_in].type;
sp[j].active = true;
}
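/* Third Kernel
 * ============
 * Reset the action potential (and the carry flag) of every
 * neuron that fired during this timestep
 */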
__global__ void carry_reset(t_neuron *n, int ncount)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
/* Might be faster depending on the architecture */
// n[idx].action_potential -= (n[idx].carry * n[idx].action_potential);
// n[idx].carry = 0;
if (n[idx].carry)
{
n[idx].action_potential = 0.0f;
n[idx].carry = 0;
}
}
/* Fourth Kernel
 * =============
 * Process the "multi-circular" buffer in batches of SPIKE_BUFFER, and update
 * the AP of the post-synaptic neuron (+ kill the spike) if needed
 */
__global__ void check_spikes(t_neuron *n, t_spike *sp, int timestep, int scount)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
{
if (timestep >= sp[i].end_t)
{
sp[i].active = false;
sp[i].end_t = INT_MAX; // to avoid the 'continue' branching
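/* note: the += below is a non-atomic read-modify-write; two spikes
 * expiring on the same timestep with the same id_out can race */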
n[sp[i].id_out].action_potential += (sp[i].value * n[sp[i].id_out].weight);
if (n[sp[i].id_out].action_potential < 0.0f)
n[sp[i].id_out].action_potential = 0.0f;
}
}
}
float simulate_network(t_network *nw)
{
static t_neuron *neur_cptr = NULL;
static t_synapse *syn_cptr = NULL;
static t_spike *spike_cptr = NULL;
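/* (x + 511) & ~511 rounds x up to the next multiple of 512, matching
 * the padded allocations made in generate_network */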
static size_t cp_size = sizeof(t_neuron) * ((nw->neur_count + 511) & ~511);
static size_t cp_syn_size = sizeof(t_synapse) * ((nw->syn_count + 511) & ~511);
static size_t cp_spike_size = sizeof(t_spike) * ((nw->syn_count + 511) & ~511) * SPIKE_BUFFER;
/* add some more space to avoid useless conditions in kernel */
if (!neur_cptr)
{
/* Malloc/cpy for neurons/synapses/spikes */
cudaMalloc((void**) &neur_cptr, cp_size);
cudaMemcpy(neur_cptr, nw->neurons, cp_size, cudaMemcpyHostToDevice);
cudaMalloc((void**) &syn_cptr, cp_syn_size);
cudaMemcpy(syn_cptr, nw->synapses, cp_syn_size, cudaMemcpyHostToDevice);
cudaMalloc((void**) &spike_cptr, cp_spike_size);
cudaMemcpy(spike_cptr, nw->spikes, cp_spike_size, cudaMemcpyHostToDevice);
cudaErrorAbort("CUDA Malloc/Memcpy Error.");
}
/* Block size/N for kernel 1, 2 and 3 */
int syn_block_size = 512 > nw->syn_count ? nw->syn_count : 512;
int syn_block_count = nw->syn_count / syn_block_size
+ (!(nw->syn_count % syn_block_size) ? 0 : 1);
int carry_block_size = 512 > nw->neur_count ? nw->neur_count : 512;
int carry_block_count = nw->neur_count / carry_block_size
+ (!(nw->neur_count % carry_block_size) ? 0 : 1);
cudaEvent_t start, stop;
float elapsed_time;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
/* Kernel call */
check_input <<< 1, nw->group_size >>> (neur_cptr, nw->timestep);
cudaDeviceSynchronize();
cudaErrorAbort("CUDA Kernel Error 1.");
check_synapses <<< syn_block_count, syn_block_size >>> (neur_cptr,
syn_cptr, spike_cptr, nw->timestep, nw->syn_count);
cudaDeviceSynchronize();
cudaErrorAbort("CUDA Kernel Error 2.");
carry_reset <<< carry_block_count, carry_block_size >>> (neur_cptr, nw->neur_count);
cudaDeviceSynchronize();
cudaErrorAbort("CUDA Kernel Error 3.");
check_spikes <<< syn_block_count, syn_block_size >>> (neur_cptr,
spike_cptr, nw->timestep, nw->syn_count);
cudaDeviceSynchronize();
cudaErrorAbort("CUDA Kernel Error 4.");
/* Timer */
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time, start, stop);
printf("[%04d] Elapsed time: %f ms\n\n", nw->timestep, elapsed_time);
/* destroy the events: this function runs every cycle, and
 * undestroyed events would leak */
cudaEventDestroy(start);
cudaEventDestroy(stop);
/* Cpy back */
cudaMemcpy(nw->neurons, neur_cptr, cp_size, cudaMemcpyDeviceToHost);
cudaMemcpy(nw->synapses, syn_cptr, cp_syn_size, cudaMemcpyDeviceToHost);
cudaMemcpy(nw->spikes, spike_cptr, cp_spike_size, cudaMemcpyDeviceToHost);
cudaErrorAbort("CUDA Memcpy Error.");
return (elapsed_time);
}
int main(void)
{
t_network *nw;
srand(time(NULL));
if (!(nw = generate_network()))
return (1);
while (43)
{
simulate_network(nw);
++nw->timestep;
}
// return (0);
}
generate_network.cpp (which initializes my structures)
#include <stdlib.h>
#include <time.h>
#include <limits.h> /* INT_MAX, used for the spike sentinel below */
#include "NN.hpp"
int randab(int a, int b)
{
return (rand() % (b - a) + a);
}
double frandab(double a, double b)
{
return ((rand() / (double)RAND_MAX) * (b - a) + a);
}
void init_neuron(t_neuron_info *nfo, t_neuron *ret, int id, int x, int y,
char type)
{
nfo->x = x;
nfo->y = y;
nfo->z = 0;
nfo->gid = id / 100;
ret->in_value = frandab(0.8f, 3.0f);
ret->in_time = randab(50, 100);
ret->next_time = ret->in_time * frandab(0.2f, 1.0f);
ret->action_potential = 0.0f;
ret->threshold = frandab(6.5f, 10.5f);
ret->weight = frandab(0.3f, 1.2f);
ret->type = type;
nfo->group = (nfo->gid == 0 ? INPUT : (nfo->gid == 1 ? OUTPUT : HIDDEN));
ret->carry = 0;
}
void init_synapse(t_synapse *ret, int in, int out)
{
ret->id_in = in;
ret->id_out = out;
ret->axonal_delay = randab(5, 30);
}
t_network *generate_network(void)
{
t_network *ret;
if (!(ret = (t_network*)malloc(sizeof(t_network))))
return (NULL);
ret->timestep = 0;
ret->group_size = 100;
/* Alloc */
ret->neur_count = 20000;
if (!(ret->neurons = (t_neuron*)malloc(sizeof(t_neuron)
* ((ret->neur_count + 511) & ~511)))
|| !(ret->neurons_info = (t_neuron_info*)malloc(sizeof(t_neuron_info)
* ((ret->neur_count + 511) & ~511))))
return (NULL);
ret->syn_count = 400000;
if (!(ret->synapses = (t_synapse*)malloc(sizeof(t_synapse)
* ((ret->syn_count + 511) & ~511))))
return (NULL);
if (!(ret->spikes = (t_spike*)malloc(sizeof(t_spike)
* ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER)))
return (NULL);
/* Assign */
for (int i = 0; i < ret->neur_count; ++i)
init_neuron(&(ret->neurons_info[i]), &(ret->neurons[i]), i,
(i / 12) * 11, (i % 60) * 18,
randab(0, 100) >= STIMULUS_RATIO ? INHIBITION : STIMULUS);
for (int i = 0; i < ret->syn_count; ++i)
{
int in = randab(0, ret->neur_count);
int out = (randab(0, 2) || ret->neurons_info[in].gid == INPUT
? randab(0, ret->neur_count)
: randab((int)(in / ret->group_size) * ret->group_size,
((int)(in / ret->group_size) + 1) * ret->group_size));
while (in == out)
out = randab(0, ret->neur_count);
init_synapse(&(ret->synapses[i]), in, out);
}
/* the padded tails are copied to the GPU and read by the extra threads
 * in each block, so they need valid (inert) data: neurons that can
 * never fire, and synapses that point only at them */
for (int i = ret->neur_count; i < ((ret->neur_count + 511) & ~511); ++i)
{
ret->neurons[i].in_time = 1;
ret->neurons[i].in_value = 0.0f;
ret->neurons[i].next_time = INT_MAX; /* never stimulated */
ret->neurons[i].action_potential = 0.0f;
ret->neurons[i].threshold = 1e30f; /* never reached */
ret->neurons[i].weight = 0.0f;
ret->neurons[i].type = STIMULUS;
ret->neurons[i].carry = 0;
}
for (int i = ret->syn_count; i < ((ret->syn_count + 511) & ~511); ++i)
init_synapse(&(ret->synapses[i]), ret->neur_count, ret->neur_count);
/* SPIKE_BUFFER instead of the literal 4, with the padded tail included:
 * check_spikes scans every slot, so inactive ones need a sentinel end_t */
for (int i = 0; i < ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER; ++i)
{
ret->spikes[i].active = false;
ret->spikes[i].end_t = INT_MAX;
}
return (ret);
}
Everything should run nicely using:
nvcc generate_network.cpp simulate_network.cu
1 Answer
As a first step, remove as many conditional branches as possible. Take a functional, branchless approach: compute the result arithmetically instead of selecting it with a branch. You also added a number of conditional returns for error checking that can be removed entirely if your arrays are set up to accommodate all inputs.
An example of the conversion:
if (n[idx].carry)
{
n[idx].action_potential = 0.0f;
n[idx].carry = 0;
}
becomes:
n[idx].action_potential = n[idx].action_potential - (n[idx].carry * n[idx].action_potential);
n[idx].carry = 0;
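The same transformation applies elsewhere. For instance, the clamp at the end of check_spikes can be written branch-free with fmaxf (a sketch of the same idea; untested on your setup):
/* instead of:
 *   if (n[sp[i].id_out].action_potential < 0.0f)
 *       n[sp[i].id_out].action_potential = 0.0f;
 */
n[sp[i].id_out].action_potential = fmaxf(n[sp[i].id_out].action_potential, 0.0f);
As always, measure -- the compiler may already be predicating a branch this simple.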
- Thanks, I'll take a look at my error checking -- in this example, won't it be a loss to add math to remove a condition that's at the end of the function anyway? – Hyllis, Jul 5, 2015 at 16:39
- I tried this and, unfortunately, I lose 0.02 ms -- and I can't really find any other conditions that aren't essential. – Hyllis, Jul 5, 2015 at 17:28