2 #ifdef NATIVE_LAPACK_LIB
3 #include <cublas_v2.h>
5 #endif
6
7 //#define _DEBUG
8
9 #ifdef _DEBUG
11 #endif
12
14 {
15
16 namespace blas_lapack
17 {
18
19 namespace native
20 {
21
#ifdef NATIVE_LAPACK_LIB
// cuBLAS context shared by all batched BLAS/LAPACK-style routines in this
// namespace; created by init() and released by destroy().
static cublasHandle_t handle;
#endif
// Guards one-time creation/destruction of the cuBLAS handle above, making
// init()/destroy() idempotent.
static bool cublas_init = false;
26
28 {
29 if (!cublas_init) {
30 #ifdef NATIVE_LAPACK_LIB
31 cublasStatus_t error = cublasCreate(&handle);
32 if (error != CUBLAS_STATUS_SUCCESS)
errorQuda(
"cublasCreate failed with error %d", error);
33 cublas_init = true;
34 #endif
35 }
36 }
37
39 {
40 if (cublas_init) {
41 #ifdef NATIVE_LAPACK_LIB
42 cublasStatus_t error = cublasDestroy(handle);
43 if (error != CUBLAS_STATUS_SUCCESS)
44 errorQuda(
"\nError indestroying cublas context, error code = %d\n", error);
45 cublas_init = false;
46 #endif
47 }
48 }
49
#ifdef _DEBUG
// Debug-only verification of one batch element: rebuild the n x n matrix A
// and its computed inverse from the column-major host buffers, then print the
// element-normalised norm of (A * Ainv - I), which should be ~0 on success.
template <typename EigenMatrix, typename Float>
__host__ void checkEigen(std::complex<Float> *A_h, std::complex<Float> *Ainv_h, int n, uint64_t batch)
{
  EigenMatrix mat = EigenMatrix::Zero(n, n);
  EigenMatrix inv = EigenMatrix::Zero(n, n);

  // Both buffers hold `batch` consecutive column-major n x n matrices.
  const uint64_t offset = batch * static_cast<uint64_t>(n) * n;
  for (int col = 0; col < n; col++) {
    for (int row = 0; row < n; row++) {
      mat(row, col) = A_h[offset + col * n + row];
      inv(row, col) = Ainv_h[offset + col * n + row];
    }
  }

  // Residual of the inversion, normalised by the number of matrix elements.
  EigenMatrix residual = mat * inv - EigenMatrix::Identity(n, n);
  Float L2norm = residual.norm() / (n * n);
  printfQuda("cuBLAS: Norm of (A * Ainv - I) batch %lu = %e\n", batch, L2norm);
}
#endif
70
// FIXME do this in pipelined fashion to reduce memory overhead.
//
// NOTE(review): this extract is garbled — the function signature and several
// setup/teardown lines are missing. Per the file's declared interface this is
// the body of:
//   long long BatchInvertMatrix(void *Ainv, void *A, const int n,
//                               const uint64_t batch, QudaPrecision prec,
//                               QudaFieldLocation location)
// which inverts `batch` n x n complex matrices via batched LU decomposition
// with cuBLAS. Missing from this view (restore from upstream — TODO): the
// timer/flops declarations, the A_d/Ainv_d device buffers, the
// dipiv/dinfo_array/info_array/A_array(_h) allocations, the precision branch
// whose `} else {` survives below, the matching frees, and the elapsed-time
// (ds/dus) computation. Only comments have been added here.
{
#ifdef NATIVE_LAPACK_LIB
  // Announce the inversion parameters: matrix dimension n and batch count.
  printfQuda("BatchInvertMatrix (native - cuBLAS): Nc = %d, batch = %lu\n", n, batch);

  // Start the wall-clock timer for the whole batched inversion.
  // (declaration of `start` is not visible in this extract)
  gettimeofday(&start, NULL);

  // Total bytes for `batch` complex n x n matrices: 2 reals per complex
  // element, `prec` bytes per real.
  size_t size = 2 * n * n * prec * batch;

#ifdef _DEBUG
  // Debug code: Copy original A matrix to host
  // NOTE(review): the initialiser of A_h is truncated in this extract — only
  // the trailing cast of the device pointer survives.
  std::complex<float> *A_h
  static_cast<std::complex<float> *>(A_d));
#endif

  // Pre-fill the host info array with '0' bytes (0x30, NOT numeric zero);
  // it is fully overwritten by the copy from dinfo_array below, so this
  // exists only to silence memcheck warnings.
  memset(info_array, '0', batch * sizeof(int));

  // Single precision path: build host-side arrays of per-matrix device
  // pointers, then ship both halves (A and Ainv pointers share one
  // allocation) to the device in a single copy.
  // NOTE(review): the allocations of A_array / A_array_h and the opening
  // `if (prec == ...)` are not visible in this extract.
  typedef cuFloatComplex C;
  C **Ainv_array = A_array + batch;
  C **Ainv_array_h = A_array_h + batch;
  for (uint64_t i = 0; i < batch; i++) {
    A_array_h[i] = static_cast<C *>(A_d) + i * n * n;
    Ainv_array_h[i] = static_cast<C *>(Ainv_d) + i * n * n;
  }
  qudaMemcpy(A_array, A_array_h, 2 * batch * sizeof(C *), cudaMemcpyHostToDevice);

  // Batched in-place LU factorisation of every A matrix (pivots in dipiv,
  // per-matrix LAPACK-style status codes in dinfo_array).
  cublasStatus_t error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);

  if (error != CUBLAS_STATUS_SUCCESS)
    errorQuda("\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);

  // Check per-matrix info: <0 means an illegal argument, >0 means U is
  // exactly singular (matrix i is not invertible).
  qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);
  for (uint64_t i = 0; i < batch; i++) {
    if (info_array[i] < 0) {
      errorQuda("%lu argument had an illegal value or another error occured, such as memory allocation failed", i);
    } else if (info_array[i] > 0) {
      errorQuda("%lu factorization completed but the factor U is exactly singular", i);
    }
  }

  // Batched inversion from the LU factors into the Ainv matrices.
  error = cublasCgetriBatched(handle, n, (const C **)A_array, n, dipiv, Ainv_array, n, dinfo_array, batch);

  if (error != CUBLAS_STATUS_SUCCESS)
    errorQuda("\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);

  // Same per-matrix status check for the inversion step.
  qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);

  for (uint64_t i = 0; i < batch; i++) {
    if (info_array[i] < 0) {
      errorQuda("%lu argument had an illegal value or another error occured, such as memory allocation failed", i);
    } else if (info_array[i] > 0) {
      errorQuda("%lu factorization completed but the factor U is exactly singular", i);
    }
  }

#ifdef _DEBUG
  // Debug code: Copy computed Ainv to host
  std::complex<float> *Ainv_h = static_cast<std::complex<float> *>(pool_pinned_malloc(size));
  qudaMemcpy((void *)Ainv_h, Ainv_d, size, cudaMemcpyDeviceToHost);

  // Verify each batch element: ||A * Ainv - I|| should be ~0.
  for (uint64_t i = 0; i < batch; i++) { checkEigen<MatrixXcf, float>(A_h, Ainv_h, n, i); }
#endif
  } else {
    // Only single precision is implemented above; any other precision aborts.
    errorQuda("%s not implemented for precision=%d", __func__, prec);
  }

  // Copy the result back to the caller's buffer.
  // NOTE(review): the enclosing location check that pairs with this copy and
  // the intervening frees are not visible in this extract.
  qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);
  }

  // Stop the timer and report throughput. (Declarations of `stop`, `ds`,
  // `dus` and `flops` are not visible in this extract.)
  gettimeofday(&stop, NULL);
  double time = ds + 0.000001 * dus;

  printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * flops / time);

#else
  errorQuda("Native BLAS not built. Please build and use native BLAS or use generic BLAS");
  return 0; // Stops a compiler warning
#endif
}
188
189 } // namespace native
190 } // namespace blas_lapack
191 } // namespace quda
#define FLOPS_CGETRF(m_, n_)
void * memset(void *s, int c, size_t n)
enum QudaPrecision_s QudaPrecision
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
#define pool_pinned_malloc(size)
#define pool_device_malloc(size)
#define pool_pinned_free(ptr)
#define pool_device_free(ptr)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const uint64_t batch, QudaPrecision precision, QudaFieldLocation location)
Batch inversion the matrix field using an LU decomposition method.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void stop()
Stop profiling.
void start()
Start profiling.
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
#define qudaDeviceSynchronize()
QudaVerbosity getVerbosity()