2 #ifdef NATIVE_LAPACK_LIB
3 #include <cublas_v2.h>
5 #endif
6
7 //#define _DEBUG
8
9 #ifdef _DEBUG
11 #endif
12
14 {
15
16 namespace blas_lapack
17 {
18
19 namespace native
20 {
21
#ifdef NATIVE_LAPACK_LIB
// cuBLAS context shared by all batched BLAS/LAPACK-style routines in this
// namespace; created by init() and released by destroy().
static cublasHandle_t handle;
#endif
// Guards one-time creation/destruction of the cuBLAS handle above, making
// init()/destroy() idempotent.
static bool cublas_init = false;
26
28 {
29 if (!cublas_init) {
30 #ifdef NATIVE_LAPACK_LIB
31 cublasStatus_t error = cublasCreate(&handle);
32 if (error != CUBLAS_STATUS_SUCCESS)
errorQuda(
"cublasCreate failed with error %d", error);
33 cublas_init = true;
34 #endif
35 }
36 }
37
39 {
40 if (cublas_init) {
41 #ifdef NATIVE_LAPACK_LIB
42 cublasStatus_t error = cublasDestroy(handle);
43 if (error != CUBLAS_STATUS_SUCCESS)
44 errorQuda(
"\nError indestroying cublas context, error code = %d\n", error);
45 cublas_init = false;
46 #endif
47 }
48 }
49
#ifdef _DEBUG
// Debug-only verification of one batch element: rebuild the n x n matrix A
// and its computed inverse from the column-major host buffers, then print the
// element-normalised norm of (A * Ainv - I), which should be ~0 on success.
template <typename EigenMatrix, typename Float>
__host__ void checkEigen(std::complex<Float> *A_h, std::complex<Float> *Ainv_h, int n, uint64_t batch)
{
  EigenMatrix mat = EigenMatrix::Zero(n, n);
  EigenMatrix inv = EigenMatrix::Zero(n, n);

  // Both buffers hold `batch` consecutive column-major n x n matrices.
  const uint64_t offset = batch * static_cast<uint64_t>(n) * n;
  for (int col = 0; col < n; col++) {
    for (int row = 0; row < n; row++) {
      mat(row, col) = A_h[offset + col * n + row];
      inv(row, col) = Ainv_h[offset + col * n + row];
    }
  }

  // Residual of the inversion, normalised by the number of matrix elements.
  EigenMatrix residual = mat * inv - EigenMatrix::Identity(n, n);
  Float L2norm = residual.norm() / (n * n);
  printfQuda("cuBLAS: Norm of (A * Ainv - I) batch %lu = %e\n", batch, L2norm);
}
#endif
70
// FIXME do this in pipelined fashion to reduce memory overhead.
//
// NOTE(review): this extract is garbled — the function signature and several
// setup/teardown lines are missing. Per the file's declared interface this is
// the body of:
//   long long BatchInvertMatrix(void *Ainv, void *A, const int n,
//                               const uint64_t batch, QudaPrecision prec,
//                               QudaFieldLocation location)
// which inverts `batch` n x n complex matrices via batched LU decomposition
// with cuBLAS. Missing from this view (restore from upstream — TODO): the
// timer/flops declarations, the A_d/Ainv_d device buffers, the
// dipiv/dinfo_array/info_array/A_array(_h) allocations, the precision branch
// whose `} else {` survives below, the matching frees, and the elapsed-time
// (ds/dus) computation. Only comments have been added here.
{
#ifdef NATIVE_LAPACK_LIB
  // Announce the inversion parameters: matrix dimension n and batch count.
  printfQuda("BatchInvertMatrix (native - cuBLAS): Nc = %d, batch = %lu\n", n, batch);

  // Start the wall-clock timer for the whole batched inversion.
  // (declaration of `start` is not visible in this extract)
  gettimeofday(&start, NULL);

  // Total bytes for `batch` complex n x n matrices: 2 reals per complex
  // element, `prec` bytes per real.
  size_t size = 2 * n * n * prec * batch;

#ifdef _DEBUG
  // Debug code: Copy original A matrix to host
  // NOTE(review): the initialiser of A_h is truncated in this extract — only
  // the trailing cast of the device pointer survives.
  std::complex<float> *A_h
  static_cast<std::complex<float> *>(A_d));
#endif

  // Pre-fill the host info array with '0' bytes (0x30, NOT numeric zero);
  // it is fully overwritten by the copy from dinfo_array below, so this
  // exists only to silence memcheck warnings.
  memset(info_array, '0', batch * sizeof(int));

  // Single precision path: build host-side arrays of per-matrix device
  // pointers, then ship both halves (A and Ainv pointers share one
  // allocation) to the device in a single copy.
  // NOTE(review): the allocations of A_array / A_array_h and the opening
  // `if (prec == ...)` are not visible in this extract.
  typedef cuFloatComplex C;
  C **Ainv_array = A_array + batch;
  C **Ainv_array_h = A_array_h + batch;
  for (uint64_t i = 0; i < batch; i++) {
    A_array_h[i] = static_cast<C *>(A_d) + i * n * n;
    Ainv_array_h[i] = static_cast<C *>(Ainv_d) + i * n * n;
  }
  qudaMemcpy(A_array, A_array_h, 2 * batch * sizeof(C *), cudaMemcpyHostToDevice);

  // Batched in-place LU factorisation of every A matrix (pivots in dipiv,
  // per-matrix LAPACK-style status codes in dinfo_array).
  cublasStatus_t error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);

  if (error != CUBLAS_STATUS_SUCCESS)
    errorQuda("\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);

  // Check per-matrix info: <0 means an illegal argument, >0 means U is
  // exactly singular (matrix i is not invertible).
  qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);
  for (uint64_t i = 0; i < batch; i++) {
    if (info_array[i] < 0) {
      errorQuda("%lu argument had an illegal value or another error occured, such as memory allocation failed", i);
    } else if (info_array[i] > 0) {
      errorQuda("%lu factorization completed but the factor U is exactly singular", i);
    }
  }

  // Batched inversion from the LU factors into the Ainv matrices.
  error = cublasCgetriBatched(handle, n, (const C **)A_array, n, dipiv, Ainv_array, n, dinfo_array, batch);

  if (error != CUBLAS_STATUS_SUCCESS)
    errorQuda("\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);

  // Same per-matrix status check for the inversion step.
  qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);

  for (uint64_t i = 0; i < batch; i++) {
    if (info_array[i] < 0) {
      errorQuda("%lu argument had an illegal value or another error occured, such as memory allocation failed", i);
    } else if (info_array[i] > 0) {
      errorQuda("%lu factorization completed but the factor U is exactly singular", i);
    }
  }

#ifdef _DEBUG
  // Debug code: Copy computed Ainv to host
  std::complex<float> *Ainv_h = static_cast<std::complex<float> *>(pool_pinned_malloc(size));
  qudaMemcpy((void *)Ainv_h, Ainv_d, size, cudaMemcpyDeviceToHost);

  // Verify each batch element: ||A * Ainv - I|| should be ~0.
  for (uint64_t i = 0; i < batch; i++) { checkEigen<MatrixXcf, float>(A_h, Ainv_h, n, i); }
#endif
  } else {
    // Only single precision is implemented above; any other precision aborts.
    errorQuda("%s not implemented for precision=%d", __func__, prec);
  }

  // Copy the result back to the caller's buffer.
  // NOTE(review): the enclosing location check that pairs with this copy and
  // the intervening frees are not visible in this extract.
  qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);
  }

  // Stop the timer and report throughput. (Declarations of `stop`, `ds`,
  // `dus` and `flops` are not visible in this extract.)
  gettimeofday(&stop, NULL);
  double time = ds + 0.000001 * dus;

  printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * flops / time);

#else
  errorQuda("Native BLAS not built. Please build and use native BLAS or use generic BLAS");
  return 0; // Stops a compiler warning
#endif
}
188
189 } // namespace native
190 } // namespace blas_lapack
191 } // namespace quda
#define FLOPS_CGETRF(m_, n_)
void * memset(void *s, int c, size_t n)
enum QudaPrecision_s QudaPrecision
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
#define pool_pinned_malloc(size)
#define pool_device_malloc(size)
#define pool_pinned_free(ptr)
#define pool_device_free(ptr)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const uint64_t batch, QudaPrecision precision, QudaFieldLocation location)
Batch inversion the matrix field using an LU decomposition method.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void stop()
Stop profiling.
void start()
Start profiling.
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
#define qudaDeviceSynchronize()
QudaVerbosity getVerbosity()