4
\$\begingroup\$

I am faced with a design issue that has been discussed on SO several times, the most similar question being this one. Essentially, I want polymorphism for a CUDA kernel in the form of a "generic" kernel that can apply a "user-supplied" function to every element in an array.

In C++, one could achieve this design by using abstract base classes. Here's a minimal example:

#include<iostream>
#include<vector>
class AbstractFunction{
 public:
 // Polymorphic base: derived classes supply the pointwise function via
 // EvalFunction; operator() maps it over an entire vector.
 // Fix: a virtual destructor is required so that deleting a derived object
 // through an AbstractFunction* is well-defined.
 virtual ~AbstractFunction() = default;
 virtual double EvalFunction(double) const = 0;
 virtual std::vector<double> operator()(std::vector<double>) const;
};
// Apply EvalFunction element-wise; returns a new vector of the same length
// with y[i] == EvalFunction(x[i]).
std::vector<double> AbstractFunction::operator()(std::vector<double> x) const{
 std::vector<double> y;
 y.reserve(x.size()); // avoid repeated reallocation while appending
 for(auto it=x.begin();it!=x.end();++it) 
 y.push_back(EvalFunction(*it));
 return y;
}
// Concrete implementation: the squaring function EvalFunction(v) = v*v.
class ConcreteFunction : public AbstractFunction{
 public:
 double EvalFunction(double value) const {
  return value * value;
 }
}; 
int main()
{
 // Evaluate the squaring functor on a two-element input and print each result.
 ConcreteFunction f;
 std::vector<double> x{1.0, 2.0};
 const std::vector<double> y = f(x);
 for(const double& value : y)
  std::cout << value << std::endl;
 return 0;
}

As it turns out, this design is nearly impossible to port to CUDA, because virtual dispatch requires the object to be constructed in the memory space where its virtual methods are called — an object constructed on the host carries a host vtable pointer that is invalid on the device. Fine. After some digging (this question and this question were very useful), I have come up with the following design, which seems to work quite well, but I'd like to hear some critiques:

#include<cstddef>
#include<iostream>
#include<new>
// The Managed class allows us to use CUDA unified memory: any class derived
// from it has its `new`-ed instances placed in managed memory, addressable
// from both host and device.
// See http://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/
class Managed {
public:
 // Allocate the object in unified memory.
 // Fix: the original ignored the cudaMallocManaged return code and could
 // return an indeterminate pointer; we now throw std::bad_alloc on failure,
 // matching the standard operator new contract.
 void *operator new(size_t len) {
  void *ptr = nullptr;
  if (cudaMallocManaged(&ptr, len) != cudaSuccess)
   throw std::bad_alloc();
  cudaDeviceSynchronize(); // make the allocation safe to touch from the host
  return ptr;
 }
 void operator delete(void *ptr) {
  // Wait for any in-flight kernels that may still reference the object
  // before releasing its memory.
  cudaDeviceSynchronize();
  cudaFree(ptr);
 }
};
// Evaluate function on GPU: y[i] = (*f)(x[i]) for i in [0, N).
// Expects a 1-D launch covering at least N threads; `f` must point to
// device-accessible (e.g. managed) memory, since it is dereferenced on the
// device.
template<class FunType>
__global__ void EvalKernelGPU(double* x,double* y,int N,FunType* f)
{
 unsigned int tId = blockIdx.x*blockDim.x + threadIdx.x;
 // Fix: with a plain `tId < N`, a negative N would be implicitly converted
 // to a huge unsigned value and every thread would read out of bounds.
 if(N > 0 && tId < static_cast<unsigned int>(N))
  y[tId] = (*f)(x[tId]);
}
//Evaluator class: "policy-based" template design.
// Owns its functor only when it default-constructs one; a functor pointer
// passed in by the caller remains the caller's responsibility to free.
template<class FunType>
class Evaluator : public FunType
{
 public: 
 Evaluator() : f(new FunType), owns_f(true) {}
 Evaluator(FunType* fun_in) : f(fun_in), owns_f(false) {}
 // Fix: the default-constructed functor used to leak (no destructor).
 ~Evaluator() { if (owns_f) delete f; }
 // Evaluate the functor element-wise over x[0..N) on the GPU.
 // Returns a cudaMallocManaged buffer of length N that the CALLER must
 // release with cudaFree, or nullptr if the allocation fails.
 double* EvalFunGPU(double* x,int N){
  double* y = nullptr;
  if (cudaMallocManaged(&y,N*sizeof(double)) != cudaSuccess)
   return nullptr; // fix: the original dereferenced an unchecked allocation
  dim3 block(512,1);
  dim3 grid((N+block.x-1)/block.x,1); // ceil-div so the grid covers all N
  EvalKernelGPU<FunType><<<grid,block>>>(x,y,N,f);
  // NOTE(review): launch-configuration errors are still silently dropped;
  // consider propagating the result of cudaGetLastError() to the caller.
  cudaGetLastError();
  cudaDeviceSynchronize(); // block until results are visible to the host
  return y;
 }
 private: 
 FunType* f;  // functor evaluated on the device (must be device-accessible)
 bool owns_f; // true when this Evaluator allocated f itself
};
//"User-defined" special function, to be evaluated.
// Inherits Managed so `new SpecialFun` lives in unified memory, which lets
// the kernel dereference the functor pointer on the device.
struct SpecialFun : public Managed
{
 // Squaring function; callable from both host and device code.
 __device__ __host__ double operator()(double x){return x*x;}
};
int main()
{
 // Square two values on the GPU via the generic Evaluator.
 Evaluator<SpecialFun> E;
 double* x = nullptr;
 if (cudaMallocManaged(&x,2*sizeof(double)) != cudaSuccess) {
  // fix: the original dereferenced x without checking the allocation
  std::cerr<<"cudaMallocManaged failed"<<std::endl;
  return 1;
 }
 x[0] = 1.0; x[1] = 2.0;
 double* y = E.EvalFunGPU(x,2);
 if (y == nullptr) {
  cudaFree(x);
  return 1;
 }
 std::cout<<"y[0] = "<<y[0]<<"\ny[1] = "<<y[1]<<std::endl;
 // fix: both managed buffers leaked in the original
 cudaFree(x);
 cudaFree(y);
 return 0;
}

I have some obvious improvements lined up, but I wanted to get input on my basic design before building it out. Thanks for your time!

asked Apr 1, 2016 at 19:37
\$\endgroup\$

1 Answer 1

2
\$\begingroup\$

Is there a good reason against using the Thrust library? It ships with the CUDA toolkit (no extra dependency when you already build with nvcc) and provides exactly that kind of functionality:

#include <iostream>
#include <thrust/transform.h>
// Two-element input/output buffers in unified (managed) memory:
// writable from the host, readable from the device, no explicit copies.
__device__ __managed__ float x[2], y[2];
// Generic squaring functor, callable from host or device code.
template <typename T> struct square {
 __host__ __device__ T operator()(const T& x) const {
  return x * x;
 }
};
int main(int argc, char const *argv[]) {
 x[0] = 1; x[1] = 2;
 // Element-wise y[i] = square(x[i]) over the two-element range [x, x+2).
 // NOTE(review): with raw pointers Thrust picks an execution backend by
 // iterator type — confirm it dispatches where you expect before relying
 // on this for GPU execution.
 thrust::transform(x, x + 2, y, square<float>());
 std::cout << x[0] << "**2 = " << y[0] << "\n";
 std::cout << x[1] << "**2 = " << y[1];
 return 0;
}
answered May 6, 2016 at 15:44
\$\endgroup\$
1
  • \$\begingroup\$ I think I tried this a few times and just couldn't get it to work for some reason. I'll take another look though, thanks! \$\endgroup\$ Commented May 6, 2016 at 17:45

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.