
I'm building a spiking neural net (recurrent, integrate-and-fire), and I'm curious about how to reduce the warp divergence (and other problems) I may have. Here's an example with a few hand-placed neurons and synapses to make it easier to follow. I uploaded the whole code to a Git repo for faster access; run make tui and then ./cudasnn to try it.

The basic workflow is to execute the four kernels (explained in the comments) one after another, once per timestep.

After 5000 cycles, here are the times in ms for each kernel, in order:

  • 1 - 0.196ms
  • 2 - 3.558ms
  • 3 - 0.038ms
  • 4 - 4.416ms to 6.278ms

My code is split into three files, whose names are fairly self-explanatory.

NN.hpp (which contains my structures)

#ifndef NN_HPP
# define NN_HPP
# include <iostream>
/* Window Setting -- for SFML, no need here */
# define WIDTH 1280
# define HEIGHT 720
# define XOFFSET -0
# define YOFFSET -95
/* Network Settings */
# define STIMULUS 1
# define INHIBITION -1
# define STIMULUS_RATIO 80
# define SPIKE_BUFFER 4
# define INPUT 0
# define OUTPUT 1
# define HIDDEN 2
typedef struct s_neuron_info
{
 float x, y, z;
 short gid; // id / 100 can exceed a signed char with 20000 neurons
 unsigned char group; // hidden, input, output
} t_neuron_info;
typedef struct s_neuron
{
 short in_time;
 float in_value;
 int next_time;
 float action_potential;
 float threshold; // fire when threshold reached
 float weight;
 char type; // stimulus, inhibition
 char carry;
} t_neuron;
typedef struct s_synapse
{
 int id_in;
 int id_out;
 int axonal_delay; // in timestep
} t_synapse;
typedef struct s_spike
{
 int syn_id;
 int id_out;
 int start_t, end_t; // in timestep
 float value;
 bool active;
} t_spike;
typedef struct s_network
{
 t_neuron *neurons;
 t_neuron_info *neurons_info;
 t_synapse *synapses;
 t_spike *spikes;
 int neur_count;
 int syn_count;
 int timestep;
 int group_size;
} t_network;
/* Generation */
t_network *generate_network(void);
/* GPU Simulation */
float simulate_network(t_network *nw);
#endif

simulate_network.cu (which contains the kernels and a small temporary main)

#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <climits>
#include "NN.hpp"
#define cudaErrorAbort(msg) \
 { \
 cudaError_t __err = cudaGetLastError(); \
 if (__err != cudaSuccess) { \
 fprintf(stderr, "[CUDA Error] %s : %s (%s:%d)\n", \
 msg, cudaGetErrorString(__err), \
 __FILE__, __LINE__); \
 exit(1); \
 } \
 }
/* First Kernel
 * ============
 * Neuronal parallelism on the INPUT group,
 * check if an INPUT neuron should fire or not.
 */
__global__ void check_input(t_neuron *n, int timestep)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 if (timestep >= n[idx].next_time)
 {
 n[idx].action_potential += n[idx].in_value;
 n[idx].next_time += n[idx].in_time;
 }
}
/* Second Kernel
 * =============
 * Synaptic parallelism, check pre-synaptic neurons
 * and create a spike if AP >= threshold
 */
__global__ void check_synapses(t_neuron *n, t_synapse *s, t_spike *sp, int timestep, int scount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 int j = -1;
 if (n[s[idx].id_in].action_potential < n[s[idx].id_in].threshold)
  return ;
 n[s[idx].id_in].carry = 1;
 for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
 {
  if (sp[i].active)
   continue;
  j = i;
  break ;
 }
 if (j < 0) // every slot of this synapse's spike buffer is still in flight
  return ;
 sp[j].syn_id = idx;
 sp[j].id_out = s[idx].id_out;
 sp[j].start_t = timestep;
 sp[j].end_t = timestep + s[idx].axonal_delay;
 sp[j].value = n[s[idx].id_in].action_potential * 0.2f * n[s[idx].id_in].type;
 sp[j].active = true;
}
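/* Third Kernel
 * ============
 * Neuronal parallelism: reset the action potential of every neuron
 * that fired during check_synapses (flagged through 'carry')
 */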
__global__ void carry_reset(t_neuron *n, int ncount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 /* Might be faster depending on the architecture */
 // n[idx].action_potential -= (n[idx].carry * n[idx].action_potential);
 // n[idx].carry = 0;
 if (n[idx].carry)
 {
 n[idx].action_potential = 0.0f;
 n[idx].carry = 0;
 }
}
/* Fourth Kernel
 * =============
 * Process the "multi-circular" buffer in batches of SPIKE_BUFFER, and update
 * the AP of the post-synaptic neuron (and kill the spike) once its delay has elapsed
 */
__global__ void check_spikes(t_neuron *n, t_spike *sp, int timestep, int scount)
{
 int idx = blockIdx.x * blockDim.x + threadIdx.x;
 for (int i = idx * SPIKE_BUFFER; i < idx * SPIKE_BUFFER + SPIKE_BUFFER; ++i)
 {
 if (timestep >= sp[i].end_t)
 {
 sp[i].active = false;
 sp[i].end_t = INT_MAX; // to avoid the 'continue' branching
 n[sp[i].id_out].action_potential += (sp[i].value * n[sp[i].id_out].weight);
 if (n[sp[i].id_out].action_potential < 0.0f)
 n[sp[i].id_out].action_potential = 0.0f;
 }
 }
}
float simulate_network(t_network *nw)
{
 static t_neuron *neur_cptr = NULL;
 static t_synapse *syn_cptr = NULL;
 static t_spike *spike_cptr = NULL;
 static size_t cp_size = sizeof(t_neuron) * ((nw->neur_count + 511) & ~511);
 static size_t cp_syn_size = sizeof(t_synapse) * ((nw->syn_count + 511) & ~511);
 static size_t cp_spike_size = sizeof(t_spike) * ((nw->syn_count + 511) & ~511) * SPIKE_BUFFER;
 /* add some more space to avoid useless conditions in kernel */
 if (!neur_cptr)
 {
 /* Malloc/cpy for neurons/synapses/spikes */
 cudaMalloc((void**) &neur_cptr, cp_size);
 cudaMemcpy(neur_cptr, nw->neurons, cp_size, cudaMemcpyHostToDevice);
 cudaMalloc((void**) &syn_cptr, cp_syn_size);
 cudaMemcpy(syn_cptr, nw->synapses, cp_syn_size, cudaMemcpyHostToDevice);
 cudaMalloc((void**) &spike_cptr, cp_spike_size);
 cudaMemcpy(spike_cptr, nw->spikes, cp_spike_size, cudaMemcpyHostToDevice);
 cudaErrorAbort("CUDA Malloc/Memcpy Error.");
 }
 /* Block size/N for kernel 1, 2 and 3 */
 int syn_block_size = 512 > nw->syn_count ? nw->syn_count : 512;
 int syn_block_count = nw->syn_count / syn_block_size
 + (!(nw->syn_count % syn_block_size) ? 0 : 1);
 int carry_block_size = 512 > nw->neur_count ? nw->neur_count : 512;
 int carry_block_count = nw->neur_count / carry_block_size
 + (!(nw->neur_count % carry_block_size) ? 0 : 1);
 cudaEvent_t start, stop;
 float elapsed_time;
 cudaEventCreate(&start);
 cudaEventRecord(start, 0);
 /* Kernel call */
 check_input <<< 1, nw->group_size >>> (neur_cptr, nw->timestep);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 1.");
 check_synapses <<< syn_block_count, syn_block_size >>> (neur_cptr,
 syn_cptr, spike_cptr, nw->timestep, nw->syn_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 2.");
 carry_reset <<< carry_block_count, carry_block_size >>> (neur_cptr, nw->neur_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 3.");
 check_spikes <<< syn_block_count, syn_block_size >>> (neur_cptr,
 spike_cptr, nw->timestep, nw->syn_count);
 cudaDeviceSynchronize();
 cudaErrorAbort("CUDA Kernel Error 4.");
 /* Timer */
 cudaEventCreate(&stop);
 cudaEventRecord(stop, 0);
 cudaEventSynchronize(stop);
 cudaEventElapsedTime(&elapsed_time, start, stop);
 printf("[%04d] Elapsed time: %f ms\n\n", nw->timestep, elapsed_time);
 cudaEventCreate(&start); cudaEventRecord(start, 0);
 /* Cpy back */
 cudaMemcpy(nw->neurons, neur_cptr, cp_size, cudaMemcpyDeviceToHost);
 cudaMemcpy(nw->synapses, syn_cptr, cp_syn_size, cudaMemcpyDeviceToHost);
 cudaMemcpy(nw->spikes, spike_cptr, cp_spike_size, cudaMemcpyDeviceToHost);
 cudaErrorAbort("CUDA Memcpy Error.");
 return (elapsed_time);
}
int main(void)
{
 t_network *nw;
 srand(time(NULL));
 if (!(nw = generate_network()))
 return (1);
 while (43)
 {
 simulate_network(nw);
 ++nw->timestep;
 }
 // return (0);
}

generate_network.cpp (which initializes my structures)

#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include "NN.hpp"
int randab(int a, int b)
{
 return (rand() % (b - a) + a);
}
double frandab(double a, double b)
{
 return ((rand() / (double)RAND_MAX) * (b - a) + a);
}
void init_neuron(t_neuron_info *nfo, t_neuron *ret, int id, int x, int y,
 char type)
{
 nfo->x = x;
 nfo->y = y;
 nfo->z = 0;
 nfo->gid = id / 100;
 ret->in_value = frandab(0.8f, 3.0f);
 ret->in_time = randab(50, 100);
 ret->next_time = ret->in_time * frandab(0.2f, 1.0f);
 ret->action_potential = 0.0f;
 ret->threshold = frandab(6.5f, 10.5f);
 ret->weight = frandab(0.3f, 1.2f);
 ret->type = type;
 nfo->group = (nfo->gid == 0 ? INPUT : (nfo->gid == 1 ? OUTPUT : HIDDEN));
 ret->carry = 0;
}
void init_synapse(t_synapse *ret, int in, int out)
{
 ret->id_in = in;
 ret->id_out = out;
 ret->axonal_delay = randab(5, 30);
}
t_network *generate_network(void)
{
 t_network *ret;
 if (!(ret = (t_network*)malloc(sizeof(t_network))))
 return (NULL);
 ret->timestep = 0;
 ret->group_size = 100;
 /* Alloc */
 ret->neur_count = 20000;
 if (!(ret->neurons = (t_neuron*)malloc(sizeof(t_neuron)
 * ((ret->neur_count + 511) & ~511)))
 || !(ret->neurons_info = (t_neuron_info*)malloc(sizeof(t_neuron_info)
 * ((ret->neur_count + 511) & ~511))))
 return (NULL);
 ret->syn_count = 400000;
 if (!(ret->synapses = (t_synapse*)malloc(sizeof(t_synapse)
 * ((ret->syn_count + 511) & ~511))))
 return (NULL);
 if (!(ret->spikes = (t_spike*)malloc(sizeof(t_spike)
 * ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER)))
 return (NULL);
 /* Assign */
 for (int i = 0; i < ret->neur_count; ++i)
 init_neuron(&(ret->neurons_info[i]), &(ret->neurons[i]), i,
 (i / 12) * 11, (i % 60) * 18,
 randab(0, 100) >= STIMULUS_RATIO ? INHIBITION : STIMULUS);
 for (int i = 0; i < ret->syn_count; ++i)
 {
 int in = randab(0, ret->neur_count);
 int out = (randab(0, 2) || ret->neurons_info[in].gid == INPUT
 ? randab(0, ret->neur_count)
 : randab((int)(in / ret->group_size) * ret->group_size,
 ((int)(in / ret->group_size) + 1) * ret->group_size));
 while (in == out)
 out = randab(0, ret->neur_count);
 init_synapse(&(ret->synapses[i]), in, out);
 }
 /* initialise the whole (padded) spike pool: check_spikes only tests end_t,
  * so inactive slots must start with end_t = INT_MAX */
 for (int i = 0; i < ((ret->syn_count + 511) & ~511) * SPIKE_BUFFER; ++i)
 {
  ret->spikes[i].active = false;
  ret->spikes[i].end_t = INT_MAX;
 }
 return (ret);
}

Everything should run nicely using:

nvcc generate_network.cpp simulate_network.cu
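
For reference, each kernel can be timed separately by bracketing its launch with its own event pair, using the same event API as the whole-frame timer in simulate_network(). A minimal sketch for the second kernel (k_start, k_stop and k2_ms are just illustrative names):

/* Sketch: timing one launch with its own event pair.
 * Assumes the pointers and block sizes already defined in simulate_network(). */
cudaEvent_t k_start, k_stop;
float k2_ms;
cudaEventCreate(&k_start);
cudaEventCreate(&k_stop);
cudaEventRecord(k_start, 0);
check_synapses <<< syn_block_count, syn_block_size >>> (neur_cptr,
    syn_cptr, spike_cptr, nw->timestep, nw->syn_count);
cudaEventRecord(k_stop, 0);
cudaEventSynchronize(k_stop);               /* wait for the kernel to finish */
cudaEventElapsedTime(&k2_ms, k_start, k_stop);
printf("kernel 2: %f ms\n", k2_ms);
cudaEventDestroy(k_start);
cudaEventDestroy(k_stop);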

1 Answer

As a first step, remove as many conditional branches as possible. Take a functional programming approach. You added a lot of conditional returns for error checking that can be removed if your arrays are set up to accommodate all inputs.

Example of this conversion:

if (n[idx].carry)
{
 n[idx].action_potential = 0.0f;
 n[idx].carry = 0;
}

becomes:

n[idx].action_potential = n[idx].action_potential - (n[idx].carry * n[idx].action_potential);
n[idx].carry = 0;
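
The same idea works for the first kernel: the comparison already yields 0 or 1, so it can gate the update directly. A sketch only; whether it actually helps depends on the architecture and on how often the branch diverges within a warp, so measure it:

/* Sketch: branch-free variant of check_input.
 * 'fire' is 0 or 1, so neurons that are not due yet add nothing. */
__global__ void check_input(t_neuron *n, int timestep)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int fire = (timestep >= n[idx].next_time);
    n[idx].action_potential += fire * n[idx].in_value;
    n[idx].next_time += fire * n[idx].in_time;
}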
  • Thanks, I'll take a look at my error checking -- in this example, won't it be a loss to add maths to remove a condition which is at the end of the function anyway?
  • I tried this, and unfortunately I lose 0.02 ms -- and I can't really find any other conditions that aren't essential.
