I'm learning CUDA and wrote a little program that generates prime numbers using the Sieve of Eratosthenes. (I know CUDA's limitations, especially around memory sizes and limits, but this program is for educational purposes.)
Questions:
- Did I set up the execution configuration correctly? (Did I set dimBlock and dimGrid correctly?)
- In my kernel I seem to compute a maxRoot constant in every thread I spawn. Is there a way to set it once on the GPU and have it shared across all threads?
- In the host code below, the loop marked "optimization #2 - sieve only odds" starts at 3 and increments by 2. How can I apply that to my kernel? Currently I spawn threads for the indices 1 to sqrt(max) and assign one index to each thread (much like looping, but incrementing i by 1 each time).
- Can you give me any other feedback on my code? What else am I doing wrong, or what else could be improved?
Any kind of feedback or improvement is welcome.
This is the HOST code for the sieve I attempted to implement in CUDA:
void sieveOfEratosthenes(uint64_t max)
{
    // There are no prime numbers smaller than 2
    if (max >= 2)
    {
        max++;
        // Create an array of size n and initialize all elements as 0
        char arr[max]; // could call calloc() and return a pointer
        memset(arr, 0, sizeof(char) * max);
        arr[0] = 1;
        arr[1] = 1;

        uint64_t maxRoot = sqrt(max); // optimization #1

        uint64_t i;
        int j; // sieve multiples of two
        for (j = 2 * 2; j < max; j += 2)
        {
            arr[j] = 1;
        }

        /* for (i = 2; i <= maxRoot; i++) */
        for (i = 3; i <= maxRoot; i += 2) // optimization #2 - sieve only odds
        {
            if (arr[i] == 0 )
            {
                int j;
                for (j = i * i; j < max; j += i)
                {
                    arr[j] = 1;
                }
            }
        }

        // display
        for (i = 0; i < max; i++)
            if (arr[i] == 0)
                printf("%d ", i);
        printf("\n");
    }
}
This is my actual CUDA file: prime.cu
#include <stdio.h>
#include <helper_cuda.h> // checkCudaErrors()
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>

typedef unsigned long long int uint64_t;

/******************************************************************************
 * kernel for finding prime numbers using the sieve of eratosthenes
 *  - primes: an array of bools. initially all numbers are set to "0".
 *            A "0" value means that the number at that index is prime.
 *  - max: the max size of the primes array
 ******************************************************************************/
__global__ static void sieveOfEratosthenesCUDA(char *primes, uint64_t max)
{
    // first thread 0
    if (threadIdx.x == 0 && threadIdx.y == 0)
    {
        primes[0] = 1; // value of 1 means the number is NOT prime
        primes[1] = 1; // numbers "0" and "1" are not prime numbers

        // sieve multiples of two
        for (int j = 2 * 2; j < max; j += 2)
        {
            primes[j] = 1;
        }
    }
    else
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        const uint64_t maxRoot = sqrt((double)max);

        // make sure index won't go out of bounds, also don't execute it
        // on index 1
        if (index < maxRoot && primes[index] == 0 && index > 1)
        {
            // mark off the composite numbers
            for (int j = index * index; j < max; j += index)
            {
                primes[j] = 1;
            }
        }
    }
}

/******************************************************************************
 * checkDevice()
 ******************************************************************************/
__host__ int checkDevice()
{
    // query the Device and decide on the block size
    int devID = 0; // the default device ID
    cudaError_t error;
    cudaDeviceProp deviceProp;

    error = cudaGetDevice(&devID);
    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);
    if (deviceProp.computeMode == cudaComputeModeProhibited || error != cudaSuccess)
    {
        printf("CUDA device ComputeMode is prohibited or failed to getDeviceProperties\n");
        return EXIT_FAILURE;
    }

    // Use a larger block size for Fermi and above (see compute capability)
    return (deviceProp.major < 2) ? 16 : 32;
}

/******************************************************************************
 * genPrimesOnDevice
 *  - inputs: limit - the largest prime that should be computed
 *            primes - an array of size [limit], initialized to 0
 ******************************************************************************/
__host__ void genPrimesOnDevice(char* primes, uint64_t max)
{
    int blockSize = checkDevice();
    if (blockSize == EXIT_FAILURE)
        return;

    char* d_Primes = NULL;
    int sizePrimes = sizeof(char) * max;
    uint64_t maxRoot = sqrt(max);

    // allocate the primes on the device and set them to 0
    checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
    checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));

    // make sure that there are no errors...
    checkCudaErrors(cudaPeekAtLastError());

    // setup the execution configuration
    dim3 dimBlock(maxRoot, 1, 1);
    dim3 dimGrid(1);

    //////// debug
#ifdef DEBUG
    printf("dimBlock(%d, %d, %d)\n", dimBlock.x, dimBlock.y, dimBlock.z);
    printf("dimGrid(%d, %d, %d)\n", dimGrid.x, dimGrid.y, dimGrid.z);
#endif

    // call the kernel
    sieveOfEratosthenesCUDA<<<dimGrid, dimBlock>>>(d_Primes, max);

    // check for kernel errors
    checkCudaErrors(cudaPeekAtLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // copy the results back
    checkCudaErrors(cudaMemcpy(primes, d_Primes, sizePrimes, cudaMemcpyDeviceToHost));

    // no memory leaks
    checkCudaErrors(cudaFree(d_Primes));
}

/******************************************************************************
 *
 ******************************************************************************/
int main()
{
    uint64_t maxPrime = 102; // find all primes from 0-101
    char* primes = (char*) malloc(maxPrime);
    memset(primes, 0, maxPrime); // initialize all elements to 0

    genPrimesOnDevice(primes, maxPrime);

    // display the results
    int i;
    for (i = 0; i < maxPrime; i++)
        if (primes[i] == 0)
            printf("%i ", i);
    printf("\n");

    free(primes);

    return 0;
}
2 Answers
You shouldn't declare the same loop variable twice in the same scope; it can get messy and cause weird bugs, and you do it here with your two loop variables, i and j:
uint64_t maxRoot = sqrt(max); // optimization #1

uint64_t i;
int j; // sieve multiples of two
for (j = 2 * 2; j < max; j += 2)
{
    arr[j] = 1;
}

/* for (i = 2; i <= maxRoot; i++) */
for (i = 3; i <= maxRoot; i += 2) // optimization #2 - sieve only odds
{
    if (arr[i] == 0 )
    {
        int j;
        for (j = i * i; j < max; j += i)
        {
            arr[j] = 1;
        }
    }
}

// display
for (i = 0; i < max; i++)
    if (arr[i] == 0)
        printf("%d ", i);
printf("\n");
You actually declare j twice inside the same scope. If you were using the resulting i and j values in the next loop, it would be a totally different story.
The other thing you are doing is declaring the i variable as a uint64_t while j is a plain old signed int. I think you should stay consistent and declare j as a uint64_t as well.
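For illustration, a minimal sketch of those two loops with one consistently typed counter each (reusing the arr, max, and maxRoot names from the question; this is only a sketch, not the author's code):

uint64_t i, j; // one declaration per counter, both unsigned

// sieve multiples of two
for (j = 2 * 2; j < max; j += 2)
{
    arr[j] = 1;
}

// optimization #2 - sieve only odds
for (i = 3; i <= maxRoot; i += 2)
{
    if (arr[i] == 0)
    {
        for (j = i * i; j < max; j += i)
        {
            arr[j] = 1;
        }
    }
}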
I think you might also come up against some issues with your array of chars: they are quite a bit smaller than your uint64_t variables, so when you start getting to the bigger primes it will error out because it can't hold those values, not because the application ran out of memory. I think you should change it to something bigger, or at least an unsigned char. I know a char is basically an integer, so why not just declare the array as a uint?
Your display code could use some curly braces so that it is clear what is supposed to be going on. Imagine writing tons of lines of code and then trying to read it when you think you may have forgotten to include something in an if or for block and the code won't run correctly... Just use curly braces:
// display
for (i = 0; i < max; i++)
{
    if (arr[i] == 0)
    {
        printf("%d ", i);
    }
}
printf("\n");
- Looks like you got it figured out, though. I know there are a few people here who review C code, and I am sure they would love to review more of your code as well. I don't do much with C, as I am mostly a .NET programmer. – Malachi, Aug 27, 2014 at 16:07
- Agreed on the importance of curly braces, especially for code run on the GPU. In regular code you can debug relatively easily, but in GPU code you absolutely do not want a single bug that you'll need to debug. – skiwi, Nov 9, 2014 at 11:17
I'd like to first compliment you for the extensive error-checking. It's harder to debug GPU code as opposed to CPU code, so it's important to utilize CUDA's error-checking functionality.
You shouldn't print errors to standard output, such as with printf(). Instead, you can print to stderr via fprintf():

fprintf(stderr, "some error was encountered");
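Applied to the question's checkDevice(), for example, the first report would become:

fprintf(stderr, "cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);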
Consider putting the array-printing into a separate host function, primarily for more modularity.
Regarding that portion, newlines ("\n") can just be printed via puts().
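Putting those two points together, the display loop could move into a small host helper along these lines (printPrimes is a name made up here for illustration):

void printPrimes(const char* primes, uint64_t max)
{
    for (uint64_t i = 0; i < max; i++)
    {
        if (primes[i] == 0)
        {
            printf("%llu ", i);
        }
    }
    puts(""); // puts() supplies the trailing newline
}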
This check looks a little odd:

if (blockSize == EXIT_FAILURE)

It would be more readable to compare blockSize to an actual quantity. The EXIT macros are only used for program termination codes. This quantity can also be a macro.
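For instance, with a dedicated sentinel macro (the name below is invented for illustration), the intent reads more clearly:

#define INVALID_BLOCK_SIZE (-1) // hypothetical sentinel returned by checkDevice() on failure

int blockSize = checkDevice();
if (blockSize == INVALID_BLOCK_SIZE)
    return;

checkDevice() would then return INVALID_BLOCK_SIZE instead of EXIT_FAILURE.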
Size variables for CUDA functions (also C library functions) should be of type size_t. This is the type that they take, and you shouldn't mismatch integer types (signed vs unsigned).
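In genPrimesOnDevice() that would look something like this (same variable names as the question; sketch only):

size_t sizePrimes = sizeof(char) * max; // size_t instead of int

checkCudaErrors(cudaMalloc(&d_Primes, sizePrimes));
checkCudaErrors(cudaMemset(d_Primes, 0, sizePrimes));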
The conditionals in sieveOfEratosthenesCUDA() may cause a performance hit from possible thread divergence, especially if they're in the same warp. If this has affected performance, then it may help to split both parts into separate kernel calls.
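A minimal sketch of such a split, which also happens to address the "sieve only odds" question; the kernel names sieveEvens and sieveOdds are invented here, and the even pass could just as well stay on the host or in a single thread:

// Pass 1: cross off the even composites (there is little parallelism to exploit
// across candidates here, so a single-thread launch, or the host, is fine).
__global__ static void sieveEvens(char *primes, uint64_t max)
{
    for (uint64_t j = 2 * 2; j < max; j += 2)
        primes[j] = 1;
}

// Pass 2: thread t takes the odd candidate 3 + 2*t, up to sqrt(max),
// and crosses off its multiples starting at its square.
__global__ static void sieveOdds(char *primes, uint64_t max)
{
    uint64_t i = 3 + 2 * (uint64_t)(blockIdx.x * blockDim.x + threadIdx.x);
    uint64_t maxRoot = (uint64_t)sqrt((double)max);

    if (i <= maxRoot && primes[i] == 0)
    {
        for (uint64_t j = i * i; j < max; j += i)
            primes[j] = 1;
    }
}

The host would then launch sieveEvens<<<1, 1>>>(d_Primes, max) followed by sieveOdds<<<dimGrid, dimBlock>>>(d_Primes, max), with enough threads to cover the odd candidates up to sqrt(max). The thread-0 special case disappears, although the bounds check in sieveOdds can still diverge for threads past the root.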