
I'm building a spiking neural net (recurrent, integrate-and-fire), and I'm curious about how to reduce the warp divergence (and other problems) I may have. Here's an example with a few hand-placed neurons and synapses to make it easier to follow. I uploaded the whole code to a Git repo for faster access; run make tui and then ./cudasnn to try it.

The basic workflow is to execute the four kernels (explained in the comments) one after another, once per timestep.

After 5000 cycles, here are the times in ms for each kernel, in order:

  • 1 - 0.196ms
  • 2 - 3.558ms
  • 3 - 0.038ms
  • 4 - 4.416ms to 6.278ms

My code is split into three files, whose names are fairly self-explanatory.

NN.hpp (which contains my structures)

#ifndef NN_HPP
# define NN_HPP
# include <iostream>
/* Window Setting -- for SFML, no need here */
# define WIDTH 1280
# define HEIGHT 720
# define XOFFSET -0
# define YOFFSET -95
/* Network Settings */
# define STIMULUS 1
# define INHIBITION -1
# define STIMULUS_RATIO 80
# define SPIKE_BUFFER 4
# define INPUT 0
# define OUTPUT 1
# define HIDDEN 2
typedef struct s_neuron_info
{
 float x, y, z;
 short gid; // id / 100 can exceed a signed char with 20000 neurons
 unsigned char group; // hidden, input, output
} t_neuron_info;
typedef struct s_neuron
{
 short in_time;
 float in_value;
 int next_time;
 float action_potential;
 float threshold; // fire when threshold reached
 float weight;
 char type; // stimulus, inhibition
 char carry;
} t_neuron;
typedef struct s_synapse
{
 int id_in;
 int id_out;
 int axonal_delay; // in timestep
} t_synapse;
typedef struct s_spike
{
 int syn_id;
 int id_out;
 int start_t, end_t; // in timestep
 float value;
 bool active;
} t_spike;
typedef struct s_network
{
 t_neuron *neurons;
 t_neuron_info *neurons_info;
 t_synapse *synapses;
 t_spike *spikes;
 int neur_count;
 int syn_count;
 int timestep;
 int group_size;
} t_network;
/* Generation */
t_network *generate_network(void);
/* GPU Simulation */
float simulate_network(t_network *nw);
#endif

simulate_network.cu (which contains the kernels and a small temporary main)

#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <climits>
#include "NN.hpp"
#define cudaErrorAbort(msg) \
 { \
 cudaError_t __err = cudaGetLastError(); \
 if (__err != cudaSuccess) { \
 fprintf(stderr, "[CUDA Error] %s : %s (%s:%d)\n", \
 msg, cudaGetErrorString(__err), \
 __FILE__, __LINE__); \
 exit(1); \
 } \
 }
/* First Kernel
 * ============
 * Neuronal parallelism on the INPUT group,
 * check if an INPUT neuron should fire or not.
 */
__global__ void check_input(t_neuron *n, int timestep)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 if (timestep >= n[idx].next_time)
 {
 n[idx].action_potential += n[idx].in_value;
 n[idx].next_time += n[idx].in_time;
 }
}
/* Second Kernel
 * =============
 * Synaptic parallelism, check pre-synaptic neurons
 * and create a spike if AP >= threshold
 */
__global__ void check_synapses(t_neuron *n, t_synapse *s, t_spike *sp, int timestep, int scount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 int j = -1;
 if (n[s[idx].id_in].action_potential < n[s[idx].id_in].threshold)
  return ;
 n[s[idx].id_in].carry = 1;
 for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
 {
  if (sp[i].active)
   continue;
  j = i;
  break ;
 }
 if (j < 0) // every slot of this synapse's spike buffer is still in flight
  return ;
 sp[j].syn_id = idx;
 sp[j].id_out = s[idx].id_out;
 sp[j].start_t = timestep;
 sp[j].end_t = timestep + s[idx].axonal_delay;
 sp[j].value = n[s[idx].id_in].action_potential * 0.2f * n[s[idx].id_in].type;
 sp[j].active = true;
}
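/* Third Kernel
 * ============
 * Neuronal parallelism: reset the action potential of every neuron
 * that fired during check_synapses (flagged through 'carry')
 */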
__global__ void carry_reset(t_neuron *n, int ncount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 /* Might be faster depending on the architecture */
 // n[idx].action_potential -= (n[idx].carry * n[idx].action_potential);
 // n[idx].carry = 0;
 if (n[idx].carry)
 {
 n[idx].action_potential = 0.0f;
 n[idx].carry = 0;
 }
}
/* Fourth Kernel
 * =============
 * Process the "multi-circular" buffer in batches of SPIKE_BUFFER, and update
 * the AP of the post-synaptic neuron (and kill the spike) once its delay has elapsed
 */
__global__ void check_spikes(t_neuron *n, t_spike *sp, int timestep, int scount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
 {
 if (timestep >= sp[i].end_t)
 {
 sp[i].active = false;
 sp[i].end_t = INT_MAX; // to avoid the 'continue' branching
 n[sp[i].id_out].action_potential += (sp[i].value * n[sp[i].id_out].weight);
 if (n[sp[i].id_out].action_potential < 0.0f)
 n[sp[i].id_out].action_potential = 0.0f;
 }
 }
}
float simulate_network(t_network *nw)
{
 static t_neuron *neur_cptr = NULL;
 static t_synapse *syn_cptr = NULL;
 static t_spike *spike_cptr = NULL;
 static size_t cp_size = sizeof(t_neuron) * ((nw->neur_count + 511) & ~511);
 static size_t cp_syn_size = sizeof(t_synapse) * ((nw->syn_count + 511) & ~511);
 static size_t cp_spike_size = sizeof(t_spike) * ((nw->syn_count + 511) & ~511) * SPIKE_BUFFER;
 /* add some more space to avoid useless conditions in kernel */
 if (!neur_cptr)
 {
 /* Malloc/cpy for neurons/synapses/spikes */
 cudaMalloc((void**) &neur_cptr, cp_size);
 cudaMemcpy(neur_cptr, nw->neurons, cp_size, cudaMemcpyHostToDevice);
 cudaMalloc((void**) &syn_cptr, cp_syn_size);
 cudaMemcpy(syn_cptr, nw->synapses, cp_syn_size, cudaMemcpyHostToDevice);
 cudaMalloc((void**) &spike_cptr, cp_spike_size);
 cudaMemcpy(spike_cptr, nw->spikes, cp_spike_size, cudaMemcpyHostToDevice);
 cudaErrorAbort("CUDA Malloc/Memcpy Error.");
 }
 /* Block size/N for kernel 1, 2 and 3 */
 int syn_block_size = 512 > nw->syn_count ? nw->syn_count : 512;
 int syn_block_count = nw->syn_count / syn_block_size
 + (!(nw->syn_count % syn_block_size) ? 0 : 1);
 int carry_block_size = 512 > nw->neur_count ? nw->neur_count : 512;
 int carry_block_count = nw->neur_count / carry_block_size
 + (!(nw->neur_count % carry_block_size) ? 0 : 1);
 cudaEvent_t start, stop;
 float elapsed_time;
 cudaEventCreate(&start);
 cudaEventRecord(start, 0);
 /* Kernel call */
 check_input <<< 1, nw->group_size >>> (neur_cptr, nw->timestep);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 1.");
 check_synapses <<< syn_block_count, syn_block_size >>> (neur_cptr,
 syn_cptr, spike_cptr, nw->timestep, nw->syn_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 2.");
 carry_reset <<< carry_block_count, carry_block_size >>> (neur_cptr, nw->neur_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 3.");
 check_spikes <<< syn_block_count, syn_block_size >>> (neur_cptr,
 spike_cptr, nw->timestep, nw->syn_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 4.");
 /* Timer */
 cudaEventCreate(&stop);
 cudaEventRecord(stop, 0);
 cudaEventSynchronize(stop);
 cudaEventElapsedTime(&elapsed_time, start, stop);
 printf("[%04d] Elapsed time: %f ms\n\n", nw->timestep, elapsed_time);
 cudaEventCreate(&start); cudaEventRecord(start, 0);
 /* Cpy back */
 cudaMemcpy(nw->neurons, neur_cptr, cp_size, cudaMemcpyDeviceToHost);
 cudaMemcpy(nw->synapses, syn_cptr, cp_syn_size, cudaMemcpyDeviceToHost);
 cudaMemcpy(nw->spikes, spike_cptr, cp_spike_size, cudaMemcpyDeviceToHost);
 cudaErrorAbort("CUDA Memcpy Error.");
 return (elapsed_time);
}
int main(void)
{
 t_network *nw;
 srand(time(NULL));
 if (!(nw = generate_network()))
 return (1);
 while (43)
 {
 simulate_network(nw);
 ++nw->timestep;
 }
 // return (0);
}

generate_network.cpp (which initializes my structures)

#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include "NN.hpp"
int randab(int a, int b)
{
 return (rand() % (b - a) + a);
}
double frandab(double a, double b)
{
 return ((rand() / (double)RAND_MAX) * (b - a) + a);
}
void init_neuron(t_neuron_info *nfo, t_neuron *ret, int id, int x, int y,
 char type)
{
 nfo->x = x;
 nfo->y = y;
 nfo->z = 0;
 nfo->gid = id / 100;
 ret->in_value = frandab(0.8f, 3.0f);
 ret->in_time = randab(50, 100);
 ret->next_time = ret->in_time * frandab(0.2f, 1.0f);
 ret->action_potential = 0.0f;
 ret->threshold = frandab(6.5f, 10.5f);
 ret->weight = frandab(0.3f, 1.2f);
 ret->type = type;
 nfo->group = (nfo->gid == 0 ? INPUT : (nfo->gid == 1 ? OUTPUT : HIDDEN));
 ret->carry = 0;
}
void init_synapse(t_synapse *ret, int in, int out)
{
 ret->id_in = in;
 ret->id_out = out;
 ret->axonal_delay = randab(5, 30);
}
t_network *generate_network(void)
{
 t_network *ret;
 if (!(ret = (t_network*)malloc(sizeof(t_network))))
 return (NULL);
 ret->timestep = 0;
 ret->group_size = 100;
 /* Alloc */
 ret->neur_count = 20000;
 if (!(ret->neurons = (t_neuron*)malloc(sizeof(t_neuron)
 * ((ret->neur_count + 511) & ~511)))
 || !(ret->neurons_info = (t_neuron_info*)malloc(sizeof(t_neuron_info)
 * ((ret->neur_count + 511) & ~511))))
 return (NULL);
 ret->syn_count = 400000;
 if (!(ret->synapses = (t_synapse*)malloc(sizeof(t_synapse)
 * ((ret->syn_count + 511) & ~511))))
 return (NULL);
 if (!(ret->spikes = (t_spike*)malloc(sizeof(t_spike)
 * ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER)))
 return (NULL);
 /* Assign */
 for (int i = 0; i < ret->neur_count; ++i)
 init_neuron(&(ret->neurons_info[i]), &(ret->neurons[i]), i,
 (i / 12) * 11, (i % 60) * 18,
 randab(0, 100) >= STIMULUS_RATIO ? INHIBITION : STIMULUS);
 for (int i = 0; i < ret->syn_count; ++i)
 {
 int in = randab(0, ret->neur_count);
 int out = (randab(0, 2) || ret->neurons_info[in].gid == INPUT
 ? randab(0, ret->neur_count)
 : randab((int)(in / ret->group_size) * ret->group_size,
 ((int)(in / ret->group_size) + 1) * ret->group_size));
 while (in == out)
 out = randab(0, ret->neur_count);
 init_synapse(&(ret->synapses[i]), in, out);
 }
 /* initialise the whole (padded) spike pool: check_spikes only tests end_t,
  * so inactive slots must start with end_t = INT_MAX */
 for (int i = 0; i < ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER; ++i)
 {
  ret->spikes[i].active = false;
  ret->spikes[i].end_t = INT_MAX;
 }
 return (ret);
}

Everything should run nicely using:

nvcc generate_network.cpp simulate_network.cu
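
For reference, each kernel can be timed separately by bracketing its launch with its own event pair, using the same event API as the whole-frame timer in simulate_network(). A minimal sketch for the second kernel (k_start, k_stop and k2_ms are just illustrative names):

/* Sketch: timing one launch with its own event pair.
 * Assumes the pointers and block sizes already defined in simulate_network(). */
cudaEvent_t k_start, k_stop;
float k2_ms;
cudaEventCreate(&k_start);
cudaEventCreate(&k_stop);
cudaEventRecord(k_start, 0);
check_synapses <<< syn_block_count, syn_block_size >>> (neur_cptr,
    syn_cptr, spike_cptr, nw->timestep, nw->syn_count);
cudaEventRecord(k_stop, 0);
cudaEventSynchronize(k_stop);               /* wait for the kernel to finish */
cudaEventElapsedTime(&k2_ms, k_start, k_stop);
printf("kernel 2: %f ms\n", k2_ms);
cudaEventDestroy(k_start);
cudaEventDestroy(k_stop);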

1 Answer

As a first step, remove as many conditional branches as possible. Take a functional programming approach. You added a lot of conditional returns for error checking that can be removed if your arrays are set up to accommodate all inputs.

Example of this conversion:

if (n[idx].carry)
{
 n[idx].action_potential = 0.0f;
 n[idx].carry = 0;
}

becomes:

n[idx].action_potential = n[idx].action_potential - (n[idx].carry * n[idx].action_potential);
n[idx].carry = 0;
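
The same idea works for the first kernel: the comparison already yields 0 or 1, so it can gate the update directly. A sketch only; whether it actually helps depends on the architecture and on how often the branch diverges within a warp, so measure it:

/* Sketch: branch-free variant of check_input.
 * 'fire' is 0 or 1, so neurons that are not due yet add nothing. */
__global__ void check_input(t_neuron *n, int timestep)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int fire = (timestep >= n[idx].next_time);
    n[idx].action_potential += fire * n[idx].in_value;
    n[idx].next_time += fire * n[idx].in_time;
}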
  • Thanks, I'll take a look at my error checking -- in this example, won't it be a loss to add maths to remove a condition which is at the end of the function anyway?
  • I tried this, and unfortunately I lose 0.02 ms -- and I can't really find any other conditions that aren't essential.
