Return to Question

#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <helper_cuda.h>
#include <math.h>
/**
 * Element-wise kernel: stores input[i] + 1 in output[i] when input[i] < 255,
 * and copies input[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param input        device array of at least numElements values (read only)
 * @param output       device array of at least numElements values (written)
 * @param numElements  number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving output[i] uninitialized when copied back to the host.
        output[i] = (input[i] < 255) ? input[i] + 1 : input[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

/**
 * Element-wise kernel: stores input[i] + 1 in output[i] when input[i] < 255,
 * and copies input[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param input        device array of at least numElements values (read only)
 * @param output       device array of at least numElements values (written)
 * @param numElements  number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving output[i] uninitialized when copied back to the host.
        output[i] = (input[i] < 255) ? input[i] + 1 : input[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <helper_cuda.h>
#include <math.h>
/**
 * Element-wise kernel: stores input[i] + 1 in output[i] when input[i] < 255,
 * and copies input[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param input        device array of at least numElements values (read only)
 * @param output       device array of at least numElements values (written)
 * @param numElements  number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving output[i] uninitialized when copied back to the host.
        output[i] = (input[i] < 255) ? input[i] + 1 : input[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

Update question description

Source Link

edited Feb 27, 2021 at 0:04

JimmyHu

edited Feb 27, 2021 at 0:04

JimmyHu

7.4k
2
10
47

The experimental implementation

The experimental implementation of the gpuIncreaseOne function is shown below.

/**
 * Element-wise kernel: stores input[i] + 1 in output[i] when input[i] < 255,
 * and copies input[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param input        device array of at least numElements values (read only)
 * @param output       device array of at least numElements values (written)
 * @param numElements  number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving output[i] uninitialized when copied back to the host.
        output[i] = (input[i] < 255) ? input[i] + 1 : input[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

/**
 * Element-wise kernel: stores InputVector[i] + 1 in OutputVector[i] when
 * InputVector[i] < 255, and copies InputVector[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param InputVector   device array of at least numElements values (read only)
 * @param OutputVector  device array of at least numElements values (written)
 * @param numElements   number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* InputVector, long double* OutputVector, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving OutputVector[i] uninitialized when copied back.
        OutputVector[i] = (InputVector[i] < 255) ? InputVector[i] + 1 : InputVector[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

The experimental implementation

The experimental implementation of the gpuIncreaseOne function is shown below.

/**
 * Element-wise kernel: stores input[i] + 1 in output[i] when input[i] < 255,
 * and copies input[i] through unchanged otherwise.
 *
 * Each thread handles the single index blockDim.x * blockIdx.x + threadIdx.x;
 * threads whose index is >= numElements do nothing.
 *
 * @param input        device array of at least numElements values (read only)
 * @param output       device array of at least numElements values (written)
 * @param numElements  number of elements to process
 *
 * NOTE(review): the element type is long double, which CUDA does not support
 * in device code, and the visible host caller passes float* buffers - confirm
 * the intended element type against the declaration of this kernel.
 */
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        // Always store a value: previously elements at or above 255 were
        // skipped, leaving output[i] uninitialized when copied back to the host.
        output[i] = (input[i] < 255) ? input[i] + 1 : input[i];
    }
}
/**
 * Runs vectorIncreaseOne on the GPU over a host float array and writes the
 * result back over the same host array.
 *
 * The data is copied to one device buffer, the kernel is launched with that
 * buffer as both input and output, and the buffer is copied back.
 *
 * @param data_for_calculation  host array of `size` floats; overwritten with the result
 * @param size                  number of elements in data_for_calculation
 * @return 0 on success (an empty input, size == 0, is a successful no-op).
 *         A negative size or any CUDA failure prints a message to stderr and
 *         terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): the kernel in this file is declared with long double*
 * parameters while the buffers here are float* - confirm the element type.
 */
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
    int numElements = size;
    if (numElements < 0)
    {
        fprintf(stderr, "Invalid element count %d!\n", numElements);
        exit(EXIT_FAILURE);
    }
    // A launch with zero blocks is rejected by the runtime, so treat an empty
    // input as "nothing to do" instead of aborting the process.
    if (numElements == 0)
    {
        return 0;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t DataSize = numElements * sizeof(float);

    // Single device buffer used as both kernel input and output. Each thread
    // reads and writes only its own index, so aliasing is safe, and every
    // element copied back is either updated or still holds the input value.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, DataSize);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host data into the device buffer
    err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch configuration: ceil(numElements / threadsPerBlock), written so
    // the intermediate value cannot overflow int for large numElements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorIncreaseOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorIncreaseOne kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back to the host. This is the first synchronizing call
    // after the launch, so kernel execution errors are reported here.
    err = cudaMemcpy(data_for_calculation, d_A, DataSize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return 0;
}

Source Link

asked Feb 26, 2021 at 23:58

JimmyHu

asked Feb 26, 2021 at 23:58

JimmyHu

7.4k
2
10
47

lang-cpp