Skip to main content
Code Review

Return to Question

static_cast<float> is C++ , not c, so changed tag.
Link
Notice removed Draw attention by Community Bot
Bounty Ended with PaulH's answer chosen by Community Bot
Notice added Draw attention by JimmyHu
Bounty Started worth 50 reputation by JimmyHu
Update code block content
Source Link
JimmyHu
  • 7.4k
  • 2
  • 10
  • 47
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <helper_cuda.h>
#include <math.h>
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (input[i] < 255)
 {
 output[i] = input[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (input[i] < 255)
 {
 output[i] = input[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <helper_cuda.h>
#include <math.h>
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (input[i] < 255)
 {
 output[i] = input[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}
Update question description
Source Link
JimmyHu
  • 7.4k
  • 2
  • 10
  • 47

The experimental implementation

The experimental implementation of gpuIncreaseOne function is as below.

__global__ void CUDACalculation::vectorIncreaseOne(const long double* InputVectorinput, long double* OutputVectoroutput, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (InputVector[i]input[i] < 255)
 {
 OutputVector[i]output[i] = InputVector[i]input[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}
__global__ void CUDACalculation::vectorIncreaseOne(const long double* InputVector, long double* OutputVector, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (InputVector[i] < 255)
 {
 OutputVector[i] = InputVector[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}

The experimental implementation

The experimental implementation of gpuIncreaseOne function is as below.

__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
 int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i < numElements)
 {
 if (input[i] < 255)
 {
 output[i] = input[i] + 1;
 }
 }
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
 // Error code to check return values for CUDA calls
 cudaError_t err = cudaSuccess;
 // Print the vector length to be used, and compute its size
 int numElements = size;
 size_t DataSize = numElements * sizeof(float);
 // Allocate the device input vector A
 float *d_A = NULL;
 err = cudaMalloc((void **)&d_A, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device input vector B
 float *d_B = NULL;
 err = cudaMalloc((void **)&d_B, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Allocate the device output vector C
 float *d_C = NULL;
 err = cudaMalloc((void **)&d_C, DataSize);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the host input vectors A and B in host memory to the device input vectors in
 // device memory
 err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Launch the Vector Add CUDA Kernel
 int threadsPerBlock = 256;
 int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
 printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Copy the device result vector in device memory to the host result vector
 // in host memory.
 err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 // Free device global memory
 err = cudaFree(d_A);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_B);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 err = cudaFree(d_C);
 if (err != cudaSuccess)
 {
 fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
 return 0;
}
Source Link
JimmyHu
  • 7.4k
  • 2
  • 10
  • 47
Loading
lang-cpp

AltStyle によって変換されたページ (->オリジナル) /