QUDA: quda/tests/host_reference/hisq_force_reference2.cpp Source File

QUDA v1.1.0

A library for QCD on GPUs

[フレーム]

hisq_force_reference2.cpp

Go to the documentation of this file.

1 #include <cstdio>

2 #include <cstdlib>

3 #include <iostream>

4 #include <iomanip>

5 #include <complex>

7 #include <quda.h>

8 #include <gauge_field.h>

10 using namespace quda;

11 //namespace quda {

13 #define RETURN_IF_ERR if(err) return;

15 extern int gauge_order;

16 extern int Vh;

17 extern int Vh_ex;

19 static int OPP_DIR(int dir){ return 7-dir; }

20 static bool GOES_FORWARDS(int dir){ return (dir<=3); }

22 template<int N>

23 struct Sign{

24 static const int result = 1;

25 };

27 template<>

28 struct Sign<1>

29 {

30 static const int result = -1;

31 };

34 template<class T, class U>

35 struct Promote

36 {

37 typedef T Type;

38 };

40 template<>

41 struct Promote<int,float>

42 {

43 typedef float Type;

44 };

46 template<>

47 struct Promote<float,int>

48 {

49 typedef float Type;

50 };

52 template<>

53 struct Promote<int, double>

54 {

55 typedef double Type;

56 };

58 template<>

59 struct Promote<double, int>

60 {

61 typedef double Type;

62 };

64 template<>

65 struct Promote<float, double>

66 {

67 typedef double Type;

68 };

70 template<>

71 struct Promote<double, float>

72 {

73 typedef double Type;

74 };

76 template<>

77 struct Promote< int, std::complex<float> >

78 {

79 typedef std::complex<float> Type;

80 };

82 template<>

83 struct Promote< std::complex<float>, int >

84 {

85 typedef std::complex<float> Type;

86 };

88 template<>

89 struct Promote< float, std::complex<float> >

90 {

91 typedef std::complex<float> Type;

92 };

94 template<>

95 struct Promote< int, std::complex<double> >

96 {

97 typedef std::complex<double> Type;

98 };

100 template<>

101 struct Promote< std::complex<double>, int >

102 {

103 typedef std::complex<double> Type;

104 };

105

106 template<>

107 struct Promote< float, std::complex<double> >

108 {

109 typedef std::complex<double> Type;

110 };

111

112 template<>

113 struct Promote< std::complex<double>, float >

114 {

115 typedef std::complex<double> Type;

116 };

117

118 template<>

119 struct Promote< double, std::complex<double> >

120 {

121 typedef std::complex<double> Type;

122 };

123

124 template<>

125 struct Promote< std::complex<double>, double >

126 {

127 typedef std::complex<double> Type;

128 };

129

130 template<int N, class T>

131 class Matrix

132 {

133 private:

134 T data[N][N];

135

136 public:

137 Matrix(); // default constructor

138 Matrix(const Matrix<N,T>& mat); // copy constructor

139 Matrix & operator += (const Matrix<N,T>& mat);

140 Matrix & operator -= (const Matrix<N,T>& mat);

141 const T& operator()(int i, int j) const;

142 T& operator()(int i, int j);

143 };

144

145 template<int N, class T>

146 Matrix<N,T>::Matrix()

147 {

148 for(int i=0; i<N; ++i){

149 for(int j=0; j<N; ++j){

150 data[i][j] = static_cast<T>(0);

151 }

152 }

153 }

154

155 template<int N, class T>

156 Matrix<N,T>::Matrix(const Matrix<N,T>& mat)

157 {

158 for(int i=0; i<N; ++i){

159 for(int j=0; j<N; ++j){

160 data[i][j] = mat.data[i][j];

161 }

162 }

163 }

164

165 template<int N, class T>

166 T& Matrix<N,T>::operator()(int i, int j)

167 {

168 return data[i][j];

169 }

170

171 template<int N, class T>

172 const T& Matrix<N,T>::operator()(int i, int j) const

173 {

174 return data[i][j];

175 }

176

177 template<int N, class T>

178 Matrix<N,T> & Matrix<N,T>::operator+=(const Matrix<N,T>& mat)

179 {

180 for(int i=0; i<N; ++i){

181 for(int j=0; j<N; ++j){

182 data[i][j] += mat.data[i][j];

183 }

184 }

185 return *this;

186 }

187

188 template<int N, class T>

189 Matrix<N,T> & Matrix<N,T>::operator-=(const Matrix<N,T>& mat)

190 {

191 for(int i=0; i<N; ++i){

192 for(int j=0; j<N; ++j){

193 data[i][j] -= mat.data[i][j];

194 }

195 }

196 return *this;

197 }

198

199 template<int N, class T>

200 Matrix<N,T> operator+(const Matrix<N,T>& a, const Matrix<N,T>& b)

201 {

202 Matrix<N,T> result(a);

203 result += b;

204 return result;

205 }

206

207 template<int N, class T>

208 Matrix<N,T> operator-(const Matrix<N,T>& a, const Matrix<N,T>& b)

209 {

210 Matrix<N,T> result(a);

211 result -= b;

212 return result;

213 }

214

215 template<int N, class T>

216 Matrix<N,T> operator*(const Matrix<N,T>& a, const Matrix<N,T>& b)

217 {

218 Matrix<N,T> result;

219 for(int i=0; i<N; ++i){

220 for(int j=0; j<N; ++j){

221 result(i,j) = static_cast<T>(0);

222 for(int k=0; k<N; ++k){

223 result(i,j) += a(i,k)*b(k,j);

224 }

225 }

226 }

227 return result;

228 }

229

230 template<int N, class T>

231 Matrix<N,std::complex<T> > conj(const Matrix<N,std::complex<T> >& mat)

232 {

233 Matrix<N,std::complex<T> > result;

234 for(int i=0; i<N; ++i){

235 for(int j=0; j<N; ++j){

236 result(i,j) = std::conj(mat(j,i));

237 }

238 }

239 return result;

240 }

241

242 template<int N, class T>

243 Matrix<N,T> transpose(const Matrix<N,std::complex<T> >& mat)

244 {

245 Matrix<N,T> result;

246 for(int i=0; i<N; ++i){

247 for(int j=0; j<N; ++j){

248 result(i,j) = mat(j,i);

249 }

250 }

251 return result;

252 }

253

254 template<int N, class T, class U>

255 Matrix<N, typename Promote<T,U>::Type> operator*(const Matrix<N,T>& mat, const U& scalar)

256 {

257 typedef typename Promote<T,U>::Type return_type;

258 Matrix<N,return_type> result;

259

260 for(int i=0; i<N; ++i){

261 for(int j=0; j<N; ++j){

262 result(i,j) = scalar*mat(i,j);

263 }

264 }

265 return result;

266 }

267

268 template<int N, class T, class U>

269 Matrix<N, typename Promote<T,U>::Type> operator*(const U& scalar, const Matrix<N,T>& mat)

270 {

271 return mat*scalar;

272 }

273

274 template<int N, class T>

275 struct Identity

276 {

277 Matrix<N,T> operator()() const {

278 Matrix<N,T> id;

279 for(int i=0; i<N; ++i){

280 id(i,i) = static_cast<T>(1);

281 }

282 return id;

283 } // operator()

284 };

285

286 template<int N, class T>

287 struct Zero

288 {

289 // the default constructor zeros all matrix elements

290 Matrix<N,T> operator()() const {

291 return Matrix<N,T>();

292 }

293 };

294

295

296 template<int N, class T>

297 std::ostream & operator << (std::ostream & os, const Matrix<N,T> & m)

298 {

299 for(int i=0; i<N; ++i){

300 for(int j=0; j<N; ++j){

301 os << m(i,j) << " ";

302 }

303 if(i<N-1) os << std::endl;

304 }

305 return os;

306 }

307

308

309

310 template<class Real>

311 class LoadStore{

312 private:

313 const int volume;

314 const int half_volume;

315 public:

316 LoadStore(int vol) : volume(vol), half_volume(vol/2) {}

317

318 void loadMatrixFromField(const Real* const field, int oddBit, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;

319

320 void loadMatrixFromField(const Real* const field, int oddBit, int dir, int half_lattice_index, Matrix<3, std::complex<Real> >* const mat) const;

321

322 void storeMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real* const field) const;

323

324 void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int half_lattice_index, Real coeff, Real* const) const;

325

326 void addMatrixToField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;

327

328 void storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real* const) const;

329 Real getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const;

330 void addData(Real* const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;

331 int half_idx_conversion_ex2normal(int half_lattice_index, const int* dim, int oddBit) const ;

332 int half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const ;

333 };

334

335 template<class Real>

336 int LoadStore<Real>::half_idx_conversion_ex2normal(int half_lattice_index_ex, const int* dim, int oddBit) const

337 {

338 int X1=dim[0];

339 int X2=dim[1];

340 int X3=dim[2];

341 //int X4=dim[3];

342

343 int E1=dim[0]+4;

344 int E2=dim[1]+4;

345 int E3=dim[2]+4;

346 //int E4=dim[3]+4;

347 int E1h=E1/2;

348

349 int sid = half_lattice_index_ex;

350

351 int za = sid/E1h;

352 int x1h = sid - za*E1h;

353 int zb = za/E2;

354 int x2 = za - zb*E2;

355 int x4 = zb/E3;

356 int x3 = zb - x4*E3;

357 int x1odd = (x2 + x3 + x4 + oddBit) & 1;

358 int x1 = 2*x1h + x1odd;

359

360 int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;

361 return idx;

362 }

363

364 template<class Real>

365 int LoadStore<Real>::half_idx_conversion_normal2ex(int half_lattice_index, const int* dim, int oddBit) const

366 {

367 int X1=dim[0];

368 int X2=dim[1];

369 int X3=dim[2];

370 //int X4=dim[3];

371 int X1h=X1/2;

372

373 int E1=dim[0]+4;

374 int E2=dim[1]+4;

375 int E3=dim[2]+4;

376 //int E4=dim[3]+4;

377 //int E1h=E1/2;

378

379 int sid = half_lattice_index;

380

381 int za = sid/X1h;

382 int x1h = sid - za*X1h;

383 int zb = za/X2;

384 int x2 = za - zb*X2;

385 int x4 = zb/X3;

386 int x3 = zb - x4*X3;

387 int x1odd = (x2 + x3 + x4 + oddBit) & 1;

388 int x1 = 2*x1h + x1odd;

389

390 int idx = ((x4+2)*E3*E2*E1 + (x3+2)*E2*E1+(x2+2)*E1+(x1+2))/2;

391

392 return idx;

393 }

394

395 template<class Real>

396 Real LoadStore<Real>::getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const

397 {

398 if(gauge_order == QUDA_MILC_GAUGE_ORDER){

399 return field[(4*hfv*oddBit +4*idx + dir)*18+offset];

400 }else{ //QDP format

401 return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];

402 }

403 }

404 template<class Real>

405 void LoadStore<Real>::addData(Real* const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const

406 {

407 if(gauge_order == QUDA_MILC_GAUGE_ORDER){

408 field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;

409 }else{ //QDP format

410 ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset] += v;

411 }

412 }

413

414

415

416 template<class Real>

417 void LoadStore<Real>::loadMatrixFromField(const Real* const field,

418 int oddBit,

419 int half_lattice_index,

420 Matrix<3, std::complex<Real> >* const mat

421 ) const

422 {

423 #ifdef MULTI_GPU

424 int hfv = Vh_ex;

425 #else

426 int hfv = Vh;

427 #endif

428

429 int offset = 0;

430 for(int i=0; i<3; ++i){

431 for(int j=0; j<3; ++j){

432 (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));

433 (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));

434 }

435 }

436 return;

437 }

438

439 template<class Real>

440 void LoadStore<Real>::loadMatrixFromField(const Real* const field,

441 int oddBit,

442 int dir,

443 int half_lattice_index,

444 Matrix<3, std::complex<Real> >* const mat

445 ) const

446 {

447 #ifdef MULTI_GPU

448 int hfv = Vh_ex;

449 #else

450 int hfv = Vh;

451 #endif

452

453 //const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;

454 int offset = 0;

455 for(int i=0; i<3; ++i){

456 for(int j=0; j<3; ++j){

457 (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));

458 (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));

459 }

460 }

461 return;

462 }

463

464 template<class Real>

465 void LoadStore<Real>::storeMatrixToField(const Matrix<3, std::complex<Real> >& mat,

466 int oddBit,

467 int half_lattice_index,

468 Real* const field) const

469 {

470 #ifdef MULTI_GPU

471 int hfv = Vh_ex;

472 #else

473 int hfv = Vh;

474 #endif

475

476 int offset = 0;

477 for(int i=0; i<3; ++i){

478 for(int j=0; j<3; ++j){

479 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).real();

480 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (mat)(i,j).imag();

481 }

482 }

483 return;

484 }

485

486 template<class Real>

487 void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,

488 int oddBit,

489 int half_lattice_index,

490 Real coeff,

491 Real* const field) const

492 {

493 #ifdef MULTI_GPU

494 int hfv = Vh_ex;

495 #else

496 int hfv = Vh;

497 #endif

498 Real* const local_field = field + (oddBit*hfv + half_lattice_index)*18;

499

500

501 int offset = 0;

502 for(int i=0; i<3; ++i){

503 for(int j=0; j<3; ++j){

504 local_field[offset++] += coeff*mat(i,j).real();

505 local_field[offset++] += coeff*mat(i,j).imag();

506 }

507 }

508 return;

509 }

510

511

512 template<class Real>

513 void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real> >& mat,

514 int oddBit,

515 int dir,

516 int half_lattice_index,

517 Real coeff,

518 Real* const field) const

519 {

520

521 #ifdef MULTI_GPU

522 int hfv = Vh_ex;

523 #else

524 int hfv = Vh;

525 #endif

526

527 //Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;

528 int offset = 0;

529 for(int i=0; i<3; ++i){

530 for(int j=0; j<3; ++j){

531 //local_field[offset++] += coeff*mat(i,j).real();

532 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).real(), hfv);

533

534 //local_field[offset++] += coeff*mat(i,j).imag();

535 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*mat(i,j).imag(), hfv);

536 }

537 }

538 return;

539 }

540

541

542 template<class Real>

543 void LoadStore<Real>::storeMatrixToMomentumField(const Matrix<3, std::complex<Real> >& mat,

544 int oddBit,

545 int dir,

546 int half_lattice_index,

547 Real coeff,

548 Real* const field) const

549 {

550 Real* const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;

551 mom_field[0] = (mat(0,1).real() - mat(1,0).real())*0.5*coeff;

552 mom_field[1] = (mat(0,1).imag() + mat(1,0).imag())*0.5*coeff;

553

554 mom_field[2] = (mat(0,2).real() - mat(2,0).real())*0.5*coeff;

555 mom_field[3] = (mat(0,2).imag() + mat(2,0).imag())*0.5*coeff;

556

557 mom_field[4] = (mat(1,2).real() - mat(2,1).real())*0.5*coeff;

558 mom_field[5] = (mat(1,2).imag() + mat(2,1).imag())*0.5*coeff;

559

560 const Real temp = (mat(0,0).imag() + mat(1,1).imag() + mat(2,2).imag())*0.3333333333333333333;

561 mom_field[6] = (mat(0,0).imag() - temp)*coeff;

562 mom_field[7] = (mat(1,1).imag() - temp)*coeff;

563 mom_field[8] = (mat(2,2).imag() - temp)*coeff;

564 mom_field[9] = 0.0;

565 return;

566 }

567

568

569

570 template<int oddBit>

571 struct Locator

572 {

573 private:

574 int local_dim[4];

575 int volume;

576 int half_index;

577 int full_index;

578 int full_coord[4];

579 void getCoordsFromHalfIndex(int half_index, int coord[4]);

580 void getCoordsFromFullIndex(int full_index, int coord[4]);

581 void cache(int half_lattice_index); // caches the half-lattice index, full-lattice index,

582 // and full-lattice coordinates

583

584

585 public:

586 Locator(const int dim[4]);

587 int getFullFromHalfIndex(int half_lattice_index);

588 int getNeighborFromFullIndex(int full_lattice_index, int dir, int* err=NULL);

589 };

590

591 template<int oddBit>

592 Locator<oddBit>::Locator(const int dim[4]) : half_index(-1), full_index(-1){

593 volume = 1;

594 for(int dir=0; dir<4; ++dir){

595 local_dim[dir] = dim[dir];

596 volume *= local_dim[dir];

597 }

598 }

599

600 // Store the half_lattice index, works out and stores the full lattice index

601 // and the coordinates.

602 template<int oddBit>

603 void Locator<oddBit>::getCoordsFromHalfIndex(int half_lattice_index, int coord[4])

604 {

605 #ifdef MULTI_GPU

606 int E1 = local_dim[0]+4;

607 int E2 = local_dim[1]+4;

608 int E3 = local_dim[2]+4;

609 //int E4 = local_dim[3]+4;

610 int E1h = E1/2;

611

612 int z1 = half_lattice_index/E1h;

613 int x1h = half_lattice_index - z1*E1h;

614 int z2 = z1/E2;

615 coord[1] = z1 - z2*E2;

616 coord[3] = z2/E3;

617 coord[2] = z2 - coord[3]*E3;

618 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;

619 coord[0] = 2*x1h + x1odd;

620 #else

621 int half_dim_0 = local_dim[0]/2;

622 int z1 = half_lattice_index/half_dim_0;

623 int x1h = half_lattice_index - z1*half_dim_0;

624 int z2 = z1/local_dim[1];

625 coord[1] = z1 - z2*local_dim[1];

626 coord[3] = z2/local_dim[2];

627 coord[2] = z2 - coord[3]*local_dim[2];

628 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;

629 coord[0] = 2*x1h + x1odd;

630 #endif

631

632 }

633

634 template<int oddBit>

635 void Locator<oddBit>::getCoordsFromFullIndex(int full_lattice_index, int coord[4])

636 {

637 #ifdef MULTI_GPU

638 int D1=local_dim[0]+4;

639 int D2=local_dim[1]+4;

640 int D3=local_dim[2]+4;

641 //int D4=local_dim[3]+4;

642 //int D1h=D1/2;

643 #else

644 int D1=local_dim[0];

645 int D2=local_dim[1];

646 int D3=local_dim[2];

647 //int D4=local_dim[3];

648 //int D1h=D1/2;

649 #endif

650

651 int z1 = full_lattice_index/D1;

652 coord[0] = full_lattice_index - z1*D1;

653 int z2 = z1/D2;

654 coord[1] = z1 - z2*D2;

655 coord[3] = z2/D3;

656 coord[2] = z2 - coord[3]*D3;

657 }

658

659

660

661

662

663 template<int oddBit>

664 void Locator<oddBit>::cache(int half_lattice_index)

665 {

666 half_index = half_lattice_index;

667 getCoordsFromHalfIndex(half_lattice_index, full_coord);

668 int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;

669 full_index = 2*half_lattice_index + x1odd;

670 return;

671 }

672

673 template<int oddBit>

674 int Locator<oddBit>::getFullFromHalfIndex(int half_lattice_index)

675 {

676 if(half_index != half_lattice_index) cache(half_lattice_index);

677 return full_index;

678 }

679

680 // From full index return the neighbouring full index

681 template<int oddBit>

682 int Locator<oddBit>::getNeighborFromFullIndex(int full_lattice_index, int dir, int* err)

683 {

684 if(err) *err = 0;

685

686 int coord[4];

687 int neighbor_index;

688 getCoordsFromFullIndex(full_lattice_index, coord);

689 #ifdef MULTI_GPU

690 int E1 = local_dim[0] + 4;

691 int E2 = local_dim[1] + 4;

692 int E3 = local_dim[2] + 4;

693 int E4 = local_dim[3] + 4;

694 switch(dir){

695 case 0: //+X

696 neighbor_index = full_lattice_index + 1;

697 if(err && (coord[0] == E1-1) ) *err = 1;

698 break;

699 case 1: //+Y

700 neighbor_index = full_lattice_index + E1;

701 if(err && (coord[1] == E2-1) ) *err = 1;

702 break;

703 case 2: //+Z

704 neighbor_index = full_lattice_index + E2*E1;

705 if(err && (coord[2] == E3-1) ) *err = 1;

706 break;

707 case 3: //+T

708 neighbor_index = full_lattice_index + E3*E2*E1;

709 if(err && (coord[3] == E4-1) ) *err = 1;

710 break;

711 case 7: //-X

712 neighbor_index = full_lattice_index - 1;

713 if(err && (coord[0] == 0) ) *err = 1;

714 break;

715 case 6: //-Y

716 neighbor_index = full_lattice_index - E1;

717 if(err && (coord[1] == 0) ) *err = 1;

718 break;

719 case 5: //-Z

720 neighbor_index = full_lattice_index - E2*E1;

721 if(err && (coord[2] == 0) ) *err = 1;

722 break;

723 case 4: //-T

724 neighbor_index = full_lattice_index - E3*E2*E1;

725 if(err && (coord[3] == 0) ) *err = 1;

726 break;

727 default:

728 errorQuda("Neighbor index could not be determined\n");

729 exit(1);

730 break;

731 } // switch(dir)

732

733 #else

734 switch(dir){

735 case 0:

736 neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;

737 break;

738 case 1:

739 neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];

740 break;

741 case 2:

742 neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];

743 break;

744 case 3:

745 neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];

746 break;

747 case 7:

748 neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;

749 break;

750 case 6:

751 neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];

752 break;

753 case 5:

754 neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];

755 break;

756 case 4:

757 neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];

758 break;

759 default:

760 errorQuda("Neighbor index could not be determined\n");

761 exit(1);

762 break;

763 } // switch(dir)

764 if(err) *err = 0;

765 #endif

766 return neighbor_index;

767 }

768

769 // Can't typedef a template

770 template<class Real>

771 struct ColorMatrix

772 {

773 typedef Matrix<3, std::complex<Real> > Type;

774 };

775

776 template<class Real, int oddBit>

777 void computeOneLinkSite(const int dim[4],

778 int half_lattice_index,

779 const Real* const oprod,

780 int sig, Real coeff,

781 const LoadStore<Real>& ls,

782 Real* const output)

783 {

784 if( GOES_FORWARDS(sig) ){

785 typename ColorMatrix<Real>::Type colorMatW;

786 #ifdef MULTI_GPU

787 int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);

788 #else

789 int idx = half_lattice_index;

790 #endif

791 ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatW);

792 ls.addMatrixToField(colorMatW, oddBit, sig, idx, coeff, output);

793 }

794 return;

795 }

796

797 template<class Real>

798 void computeOneLinkField(const int dim[4],

799 const Real* const oprod,

800 int sig, Real coeff,

801 Real* const output)

802 {

803 int volume = 1;

804 for(int dir=0; dir<4; ++dir) volume *= dim[dir];

805 const int half_volume = volume/2;

806 LoadStore<Real> ls(volume);

807 for(int site=0; site<half_volume; ++site){

808 computeOneLinkSite<Real,0>(dim, site,

809 oprod,

810 sig, coeff, ls,

811 output);

812

813 }

814 // Loop over odd lattice sites

815 for(int site=0; site<half_volume; ++site){

816 computeOneLinkSite<Real,1>(dim, site,

817 oprod,

818 sig, coeff, ls,

819 output);

820 }

821 return;

822 }

823

824

825

826

827

828

829

830

831

832 // middleLinkKernel compiles for now, but lots of debugging to be done

833 template<class Real, int oddBit>

834 void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.

835 const int dim[4],

836 const Real* const oprod,

837 const Real* const Qprev,

838 const Real* const link,

839 int sig, int mu,

840 Real coeff,

841 const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields

842 Real* const Pmu,

843 Real* const P3,

844 Real* const Qmu,

845 Real* const newOprod

846 )

847 {

848 const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;

849 const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;

850

851

852 Locator<oddBit> locator(dim);

853 int point_b, point_c, point_d;

854 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx;

855 int X = locator.getFullFromHalfIndex(half_lattice_index);

856

857 int err;

858 int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;

859 point_d = new_mem_idx >> 1;

860 // getNeighborFromFullIndex will work on any site on the lattice, odd or even

861 new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;

862 point_c = new_mem_idx >> 1;

863

864 new_mem_idx = locator.getNeighborFromFullIndex(X,sig); RETURN_IF_ERR;

865 point_b = new_mem_idx >> 1;

866

867 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;

868 bc_link_nbr_idx = (mu_positive) ? point_c : point_b;

869 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;

870

871 typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;

872 typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;

873

874

875

876

877

878 if(sig_positive){

879 ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);

880 }else{

881 ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);

882 }

883

884 if(mu_positive){

885 ls.loadMatrixFromField(link, oddBit, mu, bc_link_nbr_idx, &bc_link);

886 }else{

887 ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(mu), bc_link_nbr_idx, &bc_link);

888 }

889

890 if(Qprev == NULL){

891 if(sig_positive){

892 ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_d, &colorMatY);

893 }else{

894 ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY);

895 colorMatY = conj(colorMatY);

896 }

897 }else{ // Qprev != NULL

898 ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);

899 }

900

901 colorMatW = (!mu_positive) ? bc_link*colorMatY : conj(bc_link)*colorMatY;

902 if(Pmu) ls.storeMatrixToField(colorMatW, 1-oddBit, point_b, Pmu);

903

904 colorMatY = (sig_positive) ? ab_link*colorMatW : conj(ab_link)*colorMatW;

905 ls.storeMatrixToField(colorMatY, oddBit, half_lattice_index, P3);

906

907 if(mu_positive){

908 ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);

909 }else{

910 ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);

911 ad_link = conj(ad_link);

912 }

913

914

915 if(Qprev == NULL){

916 if(sig_positive) colorMatY = colorMatW*ad_link;

917 if(Qmu) ls.storeMatrixToField(ad_link, oddBit, half_lattice_index, Qmu);

918 }else{ // Qprev != NULL

919 if(Qmu || sig_positive){

920 ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatY);

921 colorMatX= colorMatY*ad_link;

922 }

923 if(Qmu) ls.storeMatrixToField(colorMatX, oddBit, half_lattice_index, Qmu);

924 if(sig_positive) colorMatY = colorMatW*colorMatX;

925 }

926

927 if(sig_positive) ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);

928

929 return;

930 } // computeMiddleLinkSite

931

932

933 template<class Real>

934 void computeMiddleLinkField(const int dim[4],

935 const Real* const oprod,

936 const Real* const Qprev,

937 const Real* const link,

938 int sig, int mu,

939 Real coeff,

940 Real* const Pmu,

941 Real* const P3,

942 Real* const Qmu,

943 Real* const newOprod

944 )

945 {

946

947 int volume = 1;

948 for(int dir=0; dir<4; ++dir) volume *= dim[dir];

949 #ifdef MULTI_GPU

950 const int loop_count = Vh_ex;

951 #else

952 const int loop_count = volume/2;

953 #endif

954 // loop over the lattice volume

955 // To keep the code as close to the GPU code as possible, we'll

956 // loop over the even sites first and then the odd sites

957 LoadStore<Real> ls(volume);

958 for(int site=0; site<loop_count; ++site){

959 computeMiddleLinkSite<Real, 0>(site, dim,

960 oprod, Qprev, link,

961 sig, mu, coeff,

962 ls,

963 Pmu, P3, Qmu, newOprod);

964 }

965 // Loop over odd lattice sites

966 for(int site=0; site<loop_count; ++site){

967 computeMiddleLinkSite<Real,1>(site, dim,

968 oprod, Qprev, link,

969 sig, mu, coeff,

970 ls,

971 Pmu, P3, Qmu, newOprod);

972 }

973 return;

974 }

975

976

977

978

979 template<class Real, int oddBit>

980 void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.

981 const int dim[4],

982 const Real* const P3,

983 const Real* const Qprod, // why?

984 const Real* const link,

985 int sig, int mu,

986 Real coeff, Real accumu_coeff,

987 const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields

988 Real* const shortP,

989 Real* const newOprod

990 )

991 {

992

993 const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;

994 const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;

995

996 Locator<oddBit> locator(dim);

997 int point_d;

998 int ad_link_nbr_idx;

999 int X = locator.getFullFromHalfIndex(half_lattice_index);

1000

1001 int err;

1002 int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;

1003 point_d = new_mem_idx >> 1;

1004 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;

1005

1006

1007 typename ColorMatrix<Real>::Type ad_link;

1008 typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY;

1009

1010 ls.loadMatrixFromField(P3, oddBit, half_lattice_index, &colorMatY);

1011

1012 if(shortP){

1013 if(mu_positive){

1014 ad_link_nbr_idx = point_d;

1015 ls.loadMatrixFromField(link, 1-oddBit, mu, ad_link_nbr_idx, &ad_link);

1016 }else{

1017 ad_link_nbr_idx = half_lattice_index;

1018 ls.loadMatrixFromField(link, oddBit, OPP_DIR(mu), ad_link_nbr_idx, &ad_link);

1019 }

1020 colorMatW = (mu_positive) ? ad_link*colorMatY : conj(ad_link)*colorMatY;

1021 ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);

1022 } // if(shortP)

1023

1024

1025 Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;

1026

1027 if(Qprod){

1028 ls.loadMatrixFromField(Qprod, 1-oddBit, point_d, &colorMatX);

1029 if(mu_positive){

1030 colorMatW = colorMatY*colorMatX;

1031 if(!oddBit){ mycoeff = -mycoeff; }

1032 ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, mycoeff, newOprod);

1033 }else{

1034 colorMatW = conj(colorMatX)*conj(colorMatY);

1035 if(oddBit){ mycoeff = -mycoeff; }

1036 ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);

1037 }

1038 }

1039

1040 if(!Qprod){

1041 if(mu_positive){

1042 if(!oddBit){ mycoeff = -mycoeff; }

1043 ls.addMatrixToField(colorMatY, 1-oddBit, mu, point_d, mycoeff, newOprod);

1044 }else{

1045 if(oddBit){ mycoeff = -mycoeff; }

1046 colorMatW = conj(colorMatY);

1047 ls.addMatrixToField(colorMatW, oddBit, OPP_DIR(mu), half_lattice_index, mycoeff, newOprod);

1048 }

1049 } // if !(Qprod)

1050

1051 return;

1052 } // computeSideLinkSite

1053

1054

1055 // Maybe change to computeSideLinkField

1056 template<class Real>

1057 void computeSideLinkField(const int dim[4],

1058 const Real* const P3,

1059 const Real* const Qprod, // why?

1060 const Real* const link,

1061 int sig, int mu,

1062 Real coeff, Real accumu_coeff,

1063 Real* const shortP,

1064 Real* const newOprod

1065 )

1066 {

1067 // Need some way of setting half_volume

1068 int volume = 1;

1069 for(int dir=0; dir<4; ++dir) volume *= dim[dir];

1070 #ifdef MULTI_GPU

1071 const int loop_count = Vh_ex;

1072 #else

1073 const int loop_count = volume/2;

1074 #endif

1075 LoadStore<Real> ls(volume);

1076

1077 for(int site=0; site<loop_count; ++site){

1078 computeSideLinkSite<Real,0>(site, dim,

1079 P3, Qprod, link,

1080 sig, mu,

1081 coeff, accumu_coeff,

1082 ls, shortP, newOprod);

1083 }

1084

1085 for(int site=0; site<loop_count; ++site){

1086 computeSideLinkSite<Real,1>(site, dim,

1087 P3, Qprod, link,

1088 sig, mu,

1089 coeff, accumu_coeff,

1090 ls, shortP, newOprod);

1091 }

1092

1093 return;

1094 }

1095

1096

1097

1098

1099

1100 template<class Real, int oddBit>

1101 void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.

1102 const int dim[4],

1103 const Real* const oprod,

1104 const Real* const Qprev,

1105 const Real* const link,

1106 int sig, int mu,

1107 Real coeff, Real accumu_coeff,

1108 const LoadStore<Real>& ls, // pass a function object to read from and write to matrix fields

1109 Real* const shortP,

1110 Real* const newOprod)

1111 {

1112

1113 const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;

1114 const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;

1115

1116 typename ColorMatrix<Real>::Type ab_link, bc_link, ad_link;

1117 typename ColorMatrix<Real>::Type colorMatW, colorMatX, colorMatY, colorMatZ;

1118

1119

1120 int ab_link_nbr_idx, point_b, point_c, point_d;

1121

1122 Locator<oddBit> locator(dim);

1123 int X = locator.getFullFromHalfIndex(half_lattice_index);

1124

1125 int err;

1126 int new_mem_idx = locator.getNeighborFromFullIndex(X,OPP_DIR(mu), &err); RETURN_IF_ERR;

1127 point_d = new_mem_idx >> 1;

1128

1129 new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx,sig, &err); RETURN_IF_ERR;

1130 point_c = new_mem_idx >> 1;

1131

1132 new_mem_idx = locator.getNeighborFromFullIndex(X,sig, &err); RETURN_IF_ERR;

1133 point_b = new_mem_idx >> 1;

1134 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;

1135

1136

1137

1138 Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;

1139

1140 if(mu_positive){

1141 ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);

1142 ls.loadMatrixFromField(link, 1-oddBit, mu, point_d, &ad_link);

1143 // compute point_c

1144 ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);

1145 ls.loadMatrixFromField(link, oddBit, mu, point_c, &bc_link);

1146 colorMatZ = conj(bc_link)*colorMatY; // okay

1147

1148 if(sig_positive)

1149 {

1150 colorMatY = colorMatX*ad_link;

1151 colorMatW = colorMatZ*colorMatY;

1152 ls.addMatrixToField(colorMatW, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);

1153 }

1154

1155 if(sig_positive){

1156 ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);

1157 }else{

1158 ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);

1159 }

1160 colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // okay

1161 colorMatW = colorMatY*colorMatX;

1162 ls.addMatrixToField(colorMatW, 1-oddBit, mu, point_d, -Sign<oddBit>::result*mycoeff, newOprod);

1163 colorMatW = ad_link*colorMatY;

1164 ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP); // Warning! Need to check this!

1165 }else{ //negative mu

1166 mu = OPP_DIR(mu);

1167 ls.loadMatrixFromField(Qprev, 1-oddBit, point_d, &colorMatX);

1168 ls.loadMatrixFromField(link, oddBit, mu, half_lattice_index, &ad_link);

1169 ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);

1170 ls.loadMatrixFromField(link, 1-oddBit, mu, point_b, &bc_link);

1171

1172 if(sig_positive) colorMatW = colorMatX*conj(ad_link);

1173 colorMatZ = bc_link*colorMatY;

1174 if(sig_positive){

1175 colorMatY = colorMatZ*colorMatW;

1176 ls.addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);

1177 }

1178

1179 if(sig_positive){

1180 ls.loadMatrixFromField(link, oddBit, sig, ab_link_nbr_idx, &ab_link);

1181 }else{

1182 ls.loadMatrixFromField(link, 1-oddBit, OPP_DIR(sig), ab_link_nbr_idx, &ab_link);

1183 }

1184

1185 colorMatY = (sig_positive) ? ab_link*colorMatZ : conj(ab_link)*colorMatZ; // 611

1186 colorMatW = conj(colorMatX)*conj(colorMatY);

1187

1188 ls.addMatrixToField(colorMatW, oddBit, mu, half_lattice_index, Sign<oddBit>::result*mycoeff, newOprod);

1189 colorMatW = conj(ad_link)*colorMatY;

1190 ls.addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);

1191 } // end mu

1192 return;

1193 } // allLinkKernel

1194

1195

1196

1197 template<class Real>

1198 void computeAllLinkField(const int dim[4],

1199 const Real* const oprod,

1200 const Real* const Qprev,

1201 const Real* const link,

1202 int sig, int mu,

1203 Real coeff, Real accumu_coeff,

1204 Real* const shortP,

1205 Real* const newOprod)

1206 {

1207 int volume = 1;

1208 for(int dir=0; dir<4; ++dir) volume *= dim[dir];

1209 #ifdef MULTI_GPU

1210 const int loop_count = Vh_ex;

1211 #else

1212 const int loop_count = volume/2;

1213 #endif

1214

1215 LoadStore<Real> ls(volume);

1216 for(int site=0; site<loop_count; ++site){

1217

1218 computeAllLinkSite<Real,0>(site, dim,

1219 oprod, Qprev, link,

1220 sig, mu,

1221 coeff, accumu_coeff,

1222 ls,

1223 shortP, newOprod);

1224 }

1225

1226 for(int site=0; site<loop_count; ++site){

1227 computeAllLinkSite<Real, 1>(site, dim,

1228 oprod, Qprev, link,

1229 sig, mu,

1230 coeff, accumu_coeff,

1231 ls,

1232 shortP, newOprod);

1233 }

1234

1235 return;

1236 }

1237

1238 #define Pmu tempmat[0]

1239 #define P3 tempmat[1]

1240 #define P5 tempmat[2]

1241 #define Pnumu tempmat[3]

1242 #define Qmu tempmat[4]

1243 #define Qnumu tempmat[5]

1244

1245 template<class Real>

1246 struct PathCoefficients

1247 {

1248 Real one;

1249 Real three;

1250 Real five;

1251 Real seven;

1252 Real naik;

1253 Real lepage;

1254 };

1255

1256

1257 template<class Real>

1258 void doHisqStaplesForceCPU(const int dim[4],

1259 PathCoefficients<double> staple_coeff,

1260 Real* oprod,

1261 Real* link,

1262 Real** tempmat,

1263 Real* newOprod)

1264 {

1265 Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;

1266

1267 OneLink = staple_coeff.one;

1268 ThreeSt = staple_coeff.three;

1269 FiveSt = staple_coeff.five;

1270 SevenSt = staple_coeff.seven;

1271 Lepage = staple_coeff.lepage;

1272

1273 for(int sig=0; sig<4; ++sig){

1274 computeOneLinkField(dim,

1275 oprod,

1276 sig, OneLink,

1277 newOprod);

1278 }

1279

1280

1281 // sig labels the net displacement of the staple

1282 for(int sig=0; sig<8; ++sig){

1283 for(int mu=0; mu<8; ++mu){

1284 if( mu == sig || mu == OPP_DIR(sig) ) continue;

1285

1286 computeMiddleLinkField<Real>(dim,

1287 oprod, NULL, link,

1288 sig, mu, -ThreeSt,

1289 Pmu, P3, Qmu,

1290 newOprod);

1291

1292 for(int nu=0; nu<8; ++nu){

1293 if( nu==mu || nu==OPP_DIR(mu)

1294 || nu==sig || nu==OPP_DIR(sig) ) continue;

1295

1296 computeMiddleLinkField<Real>(dim,

1297 Pmu, Qmu, link,

1298 sig, nu, staple_coeff.five,

1299 Pnumu, P5, Qnumu,

1300 newOprod);

1301

1302

1303 for(int rho=0; rho<8; ++rho){

1304 if( rho == sig || rho == OPP_DIR(sig)

1305 || rho == mu || rho == OPP_DIR(mu)

1306 || rho == nu || rho == OPP_DIR(nu) )

1307 {

1308 continue;

1309 }

1310

1311 if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;

1312 computeAllLinkField<Real>(dim,

1313 Pnumu, Qnumu, link,

1314 sig, rho, staple_coeff.seven, coeff,

1315 P5, newOprod);

1316

1317 } // rho

1318

1319 // 5-staple: side link

1320 if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;

1321 computeSideLinkField<Real>(dim,

1322 P5, Qmu, link,

1323 sig, nu, -FiveSt, coeff,

1324 P3, newOprod);

1325

1326

1327 } // nu

1328

1329

1330 // lepage

1331 if(staple_coeff.lepage != 0.){

1332 computeMiddleLinkField<Real>(dim,

1333 Pmu, Qmu, link,

1334 sig, mu, Lepage,

1335 NULL, P5, NULL,

1336 newOprod);

1337

1338 if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;

1339 computeSideLinkField<Real>(dim,

1340 P5, Qmu, link,

1341 sig, mu, -Lepage, coeff,

1342 P3, newOprod);

1343 } // lepage != 0

1344

1345

1346 computeSideLinkField<Real>(dim,

1347 P3, NULL, link,

1348 sig, mu, ThreeSt, 0.,

1349 NULL, newOprod);

1350 } // mu

1351 } // sig

1352

1353 // Need also to compute the one-link contribution

1354 return;

1355 }

1356

1357 #undef Pmu

1358 #undef P3

1359 #undef P5

1360 #undef Pnumu

1361 #undef Qmu

1362 #undef Qnumu

1363

1364

1365 void hisqStaplesForceCPU(const double* path_coeff,

1366 const QudaGaugeParam &param,

1367 cpuGaugeField &oprod,

1368 cpuGaugeField &link,

1369 cpuGaugeField* newOprod)

1370 {

1371 int volume = 1;

1372 for(int dir=0; dir<4; ++dir) volume *= param.X[dir];

1373

1374 #ifdef MULTI_GPU

1375 int len = Vh_ex*2;

1376 #else

1377 int len = volume;

1378 #endif

1379 // allocate memory for temporary fields

1380 void* tempmat[6];

1381 if(param.cpu_prec == QUDA_DOUBLE_PRECISION){

1382 for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(double));

1383 }else{

1384 for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(float));

1385 }

1386

1387 PathCoefficients<double> act_path_coeff;

1388 act_path_coeff.one = path_coeff[0];

1389 act_path_coeff.naik = path_coeff[1];

1390 act_path_coeff.three = path_coeff[2];

1391 act_path_coeff.five = path_coeff[3];

1392 act_path_coeff.seven = path_coeff[4];

1393 act_path_coeff.lepage = path_coeff[5];

1394

1395 if(param.cpu_prec == QUDA_DOUBLE_PRECISION){

1396 doHisqStaplesForceCPU<double>(param.X,

1397 act_path_coeff,

1398 (double*)oprod.Gauge_p(),

1399 (double*)link.Gauge_p(),

1400 (double**)tempmat,

1401 (double*)newOprod->Gauge_p()

1402 );

1403

1404 }else if(param.cpu_prec == QUDA_SINGLE_PRECISION){

1405 doHisqStaplesForceCPU<float>(param.X,

1406 act_path_coeff,

1407 (float*)oprod.Gauge_p(),

1408 (float*)link.Gauge_p(),

1409 (float**)tempmat,

1410 (float*)newOprod->Gauge_p()

1411 );

1412 }else{

1413 errorQuda("Unsupported precision");

1414 }

1415

1416 for(int i=0; i<6; ++i){

1417 free(tempmat[i]);

1418 }

1419 return;

1420 }

1421

1422

1423

1424 template<class Real, int oddBit>

1425 void computeLongLinkSite(int half_lattice_index,

1426 const int dim[4],

1427 const Real* const oprod,

1428 const Real* const link,

1429 int sig, Real coeff,

1430 const LoadStore<Real>& ls,

1431 Real* const output)

1432 {

1433 if( GOES_FORWARDS(sig) ){

1434

1435 Locator<oddBit> locator(dim);

1436

1437 typename ColorMatrix<Real>::Type ab_link, bc_link, de_link, ef_link;

1438 typename ColorMatrix<Real>::Type colorMatU, colorMatV, colorMatW, colorMatX, colorMatY, colorMatZ;

1439

1440 int point_a, point_b, point_c, point_d, point_e;

1441 #ifdef MULTI_GPU

1442 int idx = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);

1443 #else

1444 int idx = half_lattice_index;

1445 #endif

1446

1447 int X = locator.getFullFromHalfIndex(idx);

1448 point_c = idx;

1449

1450 int new_mem_idx = locator.getNeighborFromFullIndex(X,sig);

1451 point_d = new_mem_idx >> 1;

1452

1453 new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, sig);

1454 point_e = new_mem_idx >> 1;

1455

1456 new_mem_idx = locator.getNeighborFromFullIndex(X, OPP_DIR(sig));

1457 point_b = new_mem_idx >> 1;

1458

1459 new_mem_idx = locator.getNeighborFromFullIndex(new_mem_idx, OPP_DIR(sig));

1460 point_a = new_mem_idx >> 1;

1461

1462 ls.loadMatrixFromField(link, oddBit, sig, point_a, &ab_link);

1463 ls.loadMatrixFromField(link, 1-oddBit, sig, point_b, &bc_link);

1464 ls.loadMatrixFromField(link, 1-oddBit, sig, point_d, &de_link);

1465 ls.loadMatrixFromField(link, oddBit, sig, point_e, &ef_link);

1466

1467 ls.loadMatrixFromField(oprod, oddBit, sig, point_c, &colorMatZ);

1468 ls.loadMatrixFromField(oprod, 1-oddBit, sig, point_b, &colorMatY);

1469 ls.loadMatrixFromField(oprod, oddBit, sig, point_a, &colorMatX);

1470

1471 colorMatV = de_link*ef_link*colorMatZ

1472 - de_link*colorMatY*bc_link

1473 + colorMatX*ab_link*bc_link;

1474

1475 ls.addMatrixToField(colorMatV, oddBit, sig, point_c, coeff, output);

1476 }

1477 return;

1478 }

1479

1480 template<class Real>

1481 void computeLongLinkField(const int dim[4],

1482 const Real* const oprod,

1483 const Real* const link,

1484 int sig, Real coeff,

1485 Real* const output)

1486 {

1487 int volume = 1;

1488 for(int dir=0; dir<4; ++dir) volume *= dim[dir];

1489 const int half_volume = volume/2;

1490

1491 LoadStore<Real> ls(volume);

1492 for(int site=0; site<half_volume; ++site){

1493 computeLongLinkSite<Real,0>(site,

1494 dim,

1495 oprod,

1496 link,

1497 sig, coeff, ls,

1498 output);

1499

1500 }

1501 // Loop over odd lattice sites

1502 for(int site=0; site<half_volume; ++site){

1503 computeLongLinkSite<Real,1>(site,

1504 dim,

1505 oprod,

1506 link,

1507 sig, coeff, ls,

1508 output);

1509 }

1510 return;

1511 }

1512

1513

1514 void hisqLongLinkForceCPU(double coeff,

1515 const QudaGaugeParam &param,

1516 cpuGaugeField &oprod,

1517 cpuGaugeField &link,

1518 cpuGaugeField *newOprod)

1519 {

1520 for(int sig=0; sig<4; ++sig){

1521 if(param.cpu_prec == QUDA_SINGLE_PRECISION){

1522 computeLongLinkField<float>(param.X,

1523 (float*)oprod.Gauge_p(),

1524 (float*)link.Gauge_p(),

1525 sig, coeff,

1526 (float*)newOprod->Gauge_p());

1527 }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){

1528 computeLongLinkField<double>(param.X,

1529 (double*)oprod.Gauge_p(),

1530 (double*)link.Gauge_p(),

1531 sig, coeff,

1532 (double*)newOprod->Gauge_p());

1533 }else {

1534 errorQuda("Unrecognised precision\n");

1535 }

1536 } // sig

1537 return;

1538 }

1539

1540

1541 template<class Real, int oddBit>

1542 void completeForceSite(int half_lattice_index,

1543 const int dim[4],

1544 const Real* const oprod,

1545 const Real* const link,

1546 int sig,

1547 const LoadStore<Real>& ls,

1548 Real* const mom)

1549 {

1550

1551 typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;

1552

1553 #ifdef MULTI_GPU

1554 int half_lattice_index_ex = ls.half_idx_conversion_normal2ex(half_lattice_index, dim, oddBit);

1555 int idx = half_lattice_index_ex;

1556 #else

1557 int idx = half_lattice_index;

1558 #endif

1559 ls.loadMatrixFromField(link, oddBit, sig, idx, &linkW);

1560 ls.loadMatrixFromField(oprod, oddBit, sig, idx, &colorMatX);

1561

1562 const Real coeff = (oddBit) ? -1 : 1;

1563 colorMatY = linkW*colorMatX;

1564

1565 ls.storeMatrixToMomentumField(colorMatY, oddBit, sig, half_lattice_index, coeff, mom);

1566 return;

1567 }

1568

1569 template <class Real>

1570 void completeForceField(const int dim[4],

1571 const Real* const oprod,

1572 const Real* const link,

1573 int sig,

1574 Real* const mom)

1575 {

1576 int volume = dim[0]*dim[1]*dim[2]*dim[3];

1577 const int half_volume = volume/2;

1578 LoadStore<Real> ls(volume);

1579

1580

1581 for(int site=0; site<half_volume; ++site){

1582 completeForceSite<Real,0>(site,

1583 dim,

1584 oprod, link,

1585 sig,

1586 ls,

1587 mom);

1588

1589 }

1590 for(int site=0; site<half_volume; ++site){

1591 completeForceSite<Real,1>(site,

1592 dim,

1593 oprod, link,

1594 sig,

1595 ls,

1596 mom);

1597 }

1598 return;

1599 }

1600

1601

1602 void hisqCompleteForceCPU(const QudaGaugeParam &param,

1603 cpuGaugeField &oprod,

1604 cpuGaugeField &link,

1605 cpuGaugeField* mom)

1606 {

1607 for(int sig=0; sig<4; ++sig){

1608 if(param.cpu_prec == QUDA_SINGLE_PRECISION){

1609 completeForceField<float>(param.X,

1610 (float*)oprod.Gauge_p(),

1611 (float*)link.Gauge_p(),

1612 sig,

1613 (float*)mom->Gauge_p());

1614 }else if(param.cpu_prec == QUDA_DOUBLE_PRECISION){

1615 completeForceField<double>(param.X,

1616 (double*)oprod.Gauge_p(),

1617 (double*)link.Gauge_p(),

1618 sig,

1619 (double*)mom->Gauge_p());

1620 }else{

1621 errorQuda("Unrecognised precision\n");

1622 }

1623 } // loop over sig

1624 return;

1625 }

1626

1627

1628 //} // namespace quda

1629

1630

1631

1632

LoadStore

Definition: hisq_force_reference2.cpp:311

LoadStore::storeMatrixToMomentumField

void storeMatrixToMomentumField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const

Definition: hisq_force_reference2.cpp:543

LoadStore::getData

Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const

Definition: hisq_force_reference2.cpp:396

LoadStore::loadMatrixFromField

void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix< 3, std::complex< Real > > *const mat) const

Definition: hisq_force_reference2.cpp:417

LoadStore::half_idx_conversion_ex2normal

int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const

Definition: hisq_force_reference2.cpp:336

LoadStore::half_idx_conversion_normal2ex

int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const

Definition: hisq_force_reference2.cpp:365

LoadStore::storeMatrixToField

void storeMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real *const field) const

Definition: hisq_force_reference2.cpp:465

LoadStore::addData

void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const

Definition: hisq_force_reference2.cpp:405

LoadStore::LoadStore

LoadStore(int vol)

Definition: hisq_force_reference2.cpp:316

LoadStore::addMatrixToField

void addMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real coeff, Real *const) const

Definition: hisq_force_reference2.cpp:487

Matrix::operator()

T & operator()(int i, int j)

Matrix::Matrix

Matrix()

Matrix::operator-=

Matrix & operator-=(const Matrix< N, T > &mat)

Definition: hisq_force_reference2.cpp:189

Matrix::operator+=

Matrix & operator+=(const Matrix< N, T > &mat)

Definition: hisq_force_reference2.cpp:178

Matrix::operator()

const T & operator()(int i, int j) const

quda::Matrix

Definition: quda_matrix.h:63

quda::Matrix::data

T data[N *N]

Definition: quda_matrix.h:70

quda::Matrix::Matrix

__device__ __host__ Matrix()

Definition: quda_matrix.h:74

quda::Matrix::operator()

__device__ __host__ T const & operator()(int i, int j) const

Definition: quda_matrix.h:96

quda::cpuGaugeField

Definition: gauge_field.h:626

quda::cpuGaugeField::Gauge_p

void * Gauge_p()

Definition: gauge_field.h:688

dim

std::array< int, 4 > dim

Definition: command_line_params.cpp:34

double mu

Definition: command_line_params.cpp:73

mat

void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)

Definition: covdev_reference.cpp:109

QUDA_DOUBLE_PRECISION

@ QUDA_DOUBLE_PRECISION

Definition: enum_quda.h:65

QUDA_SINGLE_PRECISION

@ QUDA_SINGLE_PRECISION

Definition: enum_quda.h:64

QUDA_MILC_GAUGE_ORDER

@ QUDA_MILC_GAUGE_ORDER

Definition: enum_quda.h:47

gauge_field.h

#define P3

Definition: hisq_force_reference2.cpp:1239

completeForceField

void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)

Definition: hisq_force_reference2.cpp:1570

Qmu

#define Qmu

Definition: hisq_force_reference2.cpp:1242

computeLongLinkField

void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, Real *const output)

Definition: hisq_force_reference2.cpp:1481

int Vh

Definition: host_utils.cpp:38

computeAllLinkField

void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)

Definition: hisq_force_reference2.cpp:1198

computeAllLinkSite

void computeAllLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)

Definition: hisq_force_reference2.cpp:1101

completeForceSite

void completeForceSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, const LoadStore< Real > &ls, Real *const mom)

Definition: hisq_force_reference2.cpp:1542

computeSideLinkSite

void computeSideLinkSite(int half_lattice_index, const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)

Definition: hisq_force_reference2.cpp:980

#define P5

Definition: hisq_force_reference2.cpp:1240

doHisqStaplesForceCPU

void doHisqStaplesForceCPU(const int dim[4], PathCoefficients< double > staple_coeff, Real *oprod, Real *link, Real **tempmat, Real *newOprod)

Definition: hisq_force_reference2.cpp:1258

Pnumu

#define Pnumu

Definition: hisq_force_reference2.cpp:1241

computeLongLinkSite

void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)

Definition: hisq_force_reference2.cpp:1425

hisqStaplesForceCPU

void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)

Definition: hisq_force_reference2.cpp:1365

computeMiddleLinkField

void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)

Definition: hisq_force_reference2.cpp:934

transpose

Matrix< N, T > transpose(const Matrix< N, std::complex< T > > &mat)

Definition: hisq_force_reference2.cpp:243

RETURN_IF_ERR

#define RETURN_IF_ERR

Definition: hisq_force_reference2.cpp:13

Pmu

#define Pmu

Definition: hisq_force_reference2.cpp:1238

computeMiddleLinkSite

void computeMiddleLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, const LoadStore< Real > &ls, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)

Definition: hisq_force_reference2.cpp:834

gauge_order

int gauge_order

Definition: hisq_paths_force_test.cpp:34

Qnumu

#define Qnumu

Definition: hisq_force_reference2.cpp:1243

computeOneLinkField

void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)

Definition: hisq_force_reference2.cpp:798

Vh_ex

int Vh_ex

Definition: host_utils.cpp:46

computeSideLinkField

void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)

Definition: hisq_force_reference2.cpp:1057

hisqCompleteForceCPU

void hisqCompleteForceCPU(const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *mom)

Definition: hisq_force_reference2.cpp:1602

conj

Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)

Definition: hisq_force_reference2.cpp:231

hisqLongLinkForceCPU

void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)

Definition: hisq_force_reference2.cpp:1514

computeOneLinkSite

void computeOneLinkSite(const int dim[4], int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)

Definition: hisq_force_reference2.cpp:777

E1h

int E1h

Definition: host_utils.cpp:44

int E1

Definition: host_utils.cpp:44

int E2

Definition: host_utils.cpp:44

int E4

Definition: host_utils.cpp:44

int E3

Definition: host_utils.cpp:44