1 #include <cstdio>
2 #include <cstdlib>
3 #include <iostream>
4 #include <iomanip>
5 #include <complex>
6
9
11 //namespace quda {
12
13 #define RETURN_IF_ERR if(err) return;
14
18
19 static int OPP_DIR(
int dir){
return 7-dir; }
21
22 template<int N>
24 static const int result = 1;
25 };
26
27 template<>
29 {
30 static const int result = -1;
31 };
32
33
34 template<class T, class U>
36 {
38 };
39
40 template<>
42 {
44 };
45
46 template<>
48 {
50 };
51
52 template<>
54 {
56 };
57
58 template<>
60 {
62 };
63
64 template<>
66 {
68 };
69
70 template<>
72 {
74 };
75
76 template<>
78 {
79 typedef std::complex<float>
Type;
80 };
81
82 template<>
84 {
85 typedef std::complex<float>
Type;
86 };
87
88 template<>
90 {
91 typedef std::complex<float>
Type;
92 };
93
94 template<>
96 {
97 typedef std::complex<double>
Type;
98 };
99
100 template<>
102 {
103 typedef std::complex<double>
Type;
104 };
105
106 template<>
108 {
109 typedef std::complex<double>
Type;
110 };
111
112 template<>
114 {
115 typedef std::complex<double>
Type;
116 };
117
118 template<>
120 {
121 typedef std::complex<double>
Type;
122 };
123
124 template<>
126 {
127 typedef std::complex<double>
Type;
128 };
129
130 template<int N, class T>
132 {
133 private:
135
136 public:
143 };
144
145 template<int N, class T>
147 {
148 for(int i=0; i<N; ++i){
149 for(int j=0; j<N; ++j){
150 data[i][j] = static_cast<T>(0);
151 }
152 }
153 }
154
155 template<int N, class T>
157 {
158 for(int i=0; i<N; ++i){
159 for(int j=0; j<N; ++j){
160 data[i][j] =
mat.data[i][j];
161 }
162 }
163 }
164
165 template<int N, class T>
167 {
168 return data[i][j];
169 }
170
171 template<int N, class T>
173 {
174 return data[i][j];
175 }
176
177 template<int N, class T>
179 {
180 for(int i=0; i<N; ++i){
181 for(int j=0; j<N; ++j){
182 data[i][j] +=
mat.data[i][j];
183 }
184 }
185 return *this;
186 }
187
188 template<int N, class T>
190 {
191 for(int i=0; i<N; ++i){
192 for(int j=0; j<N; ++j){
193 data[i][j] -=
mat.data[i][j];
194 }
195 }
196 return *this;
197 }
198
199 template<int N, class T>
201 {
203 result += b;
204 return result;
205 }
206
207 template<int N, class T>
209 {
211 result -= b;
212 return result;
213 }
214
215 template<int N, class T>
217 {
219 for(int i=0; i<N; ++i){
220 for(int j=0; j<N; ++j){
221 result(i,j) = static_cast<T>(0);
222 for(int k=0; k<N; ++k){
223 result(i,j) += a(i,k)*b(k,j);
224 }
225 }
226 }
227 return result;
228 }
229
230 template<int N, class T>
232 {
234 for(int i=0; i<N; ++i){
235 for(int j=0; j<N; ++j){
237 }
238 }
239 return result;
240 }
241
242 template<int N, class T>
244 {
246 for(int i=0; i<N; ++i){
247 for(int j=0; j<N; ++j){
248 result(i,j) =
mat(j,i);
249 }
250 }
251 return result;
252 }
253
254 template<int N, class T, class U>
256 {
259
260 for(int i=0; i<N; ++i){
261 for(int j=0; j<N; ++j){
263 }
264 }
265 return result;
266 }
267
268 template<int N, class T, class U>
270 {
272 }
273
274 template<int N, class T>
276 {
279 for(int i=0; i<N; ++i){
280 id(i,i) = static_cast<T>(1);
281 }
282 return id;
283 } // operator()
284 };
285
286 template<int N, class T>
288 {
289 // the default constructor zeros all matrix elements
292 }
293 };
294
295
296 template<int N, class T>
298 {
299 for(int i=0; i<N; ++i){
300 for(int j=0; j<N; ++j){
301 os << m(i,j) << " ";
302 }
303 if(i<N-1) os << std::endl;
304 }
305 return os;
306 }
307
308
309
310 template<class Real>
312 private:
313 const int volume;
314 const int half_volume;
315 public:
317
318 void loadMatrixFromField(
const Real*
const field,
int oddBit,
int half_lattice_index,
Matrix<3, std::complex<Real> >*
const mat)
const;
319
320 void loadMatrixFromField(
const Real*
const field,
int oddBit,
int dir,
int half_lattice_index,
Matrix<3, std::complex<Real> >*
const mat)
const;
321
322 void storeMatrixToField(
const Matrix<3, std::complex<Real> >&
mat,
int oddBit,
int half_lattice_index, Real*
const field)
const;
323
324 void addMatrixToField(
const Matrix<3, std::complex<Real> >&
mat,
int oddBit,
int half_lattice_index, Real coeff, Real*
const)
const;
325
326 void addMatrixToField(
const Matrix<3, std::complex<Real> >&
mat,
int oddBit,
int dir,
int half_lattice_index, Real coeff, Real*
const)
const;
327
328 void storeMatrixToMomentumField(
const Matrix<3, std::complex<Real> >&
mat,
int oddBit,
int dir,
int half_lattice_index, Real coeff, Real*
const)
const;
329 Real getData(const Real* const field, int idx, int dir, int oddBit, int offset, int hfv) const;
330 void addData(Real* const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
331 int half_idx_conversion_ex2normal(
int half_lattice_index,
const int*
dim,
int oddBit)
const ;
332 int half_idx_conversion_normal2ex(
int half_lattice_index,
const int*
dim,
int oddBit)
const ;
333 };
334
335 template<class Real>
337 {
341 //int X4=dim[3];
342
346 //int E4=dim[3]+4;
348
349 int sid = half_lattice_index_ex;
350
352 int x1h = sid - za*
E1h;
357 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
358 int x1 = 2*x1h + x1odd;
359
360 int idx = ((x4-2)*X3*X2*X1 + (x3-2)*X2*X1+(x2-2)*X1+(x1-2))/2;
361 return idx;
362 }
363
364 template<class Real>
366 {
370 //int X4=dim[3];
371 int X1h=X1/2;
372
376 //int E4=dim[3]+4;
377 //int E1h=E1/2;
378
379 int sid = half_lattice_index;
380
381 int za = sid/X1h;
382 int x1h = sid - za*X1h;
383 int zb = za/X2;
384 int x2 = za - zb*X2;
385 int x4 = zb/X3;
386 int x3 = zb - x4*X3;
387 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
388 int x1 = 2*x1h + x1odd;
389
390 int idx = ((x4+2)*
E3*
E2*
E1 + (x3+2)*
E2*
E1+(x2+2)*
E1+(x1+2))/2;
391
392 return idx;
393 }
394
395 template<class Real>
397 {
399 return field[(4*hfv*oddBit +4*idx + dir)*18+offset];
400 }else{ //QDP format
401 return ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset];
402 }
403 }
404 template<class Real>
406 {
408 field[(4*hfv*oddBit +4*idx + dir)*18+offset] += v;
409 }else{ //QDP format
410 ((Real**)field)[dir][(hfv*oddBit+idx)*18 +offset] += v;
411 }
412 }
413
414
415
416 template<class Real>
418 int oddBit,
419 int half_lattice_index,
420 Matrix<3, std::complex<Real> >*
const mat
421 ) const
422 {
423 #ifdef MULTI_GPU
425 #else
427 #endif
428
429 int offset = 0;
430 for(int i=0; i<3; ++i){
431 for(int j=0; j<3; ++j){
432 (*mat)(i,j) = (*(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
433 (*mat)(i,j) += std::complex<Real>(0, *(field + (oddBit*hfv + half_lattice_index)*18 + offset++));
434 }
435 }
436 return;
437 }
438
439 template<class Real>
441 int oddBit,
442 int dir,
443 int half_lattice_index,
444 Matrix<3, std::complex<Real> >*
const mat
445 ) const
446 {
447 #ifdef MULTI_GPU
449 #else
451 #endif
452
453 //const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
454 int offset = 0;
455 for(int i=0; i<3; ++i){
456 for(int j=0; j<3; ++j){
457 (*mat)(i,j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
458 (*mat)(i,j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
459 }
460 }
461 return;
462 }
463
464 template<class Real>
466 int oddBit,
467 int half_lattice_index,
468 Real* const field) const
469 {
470 #ifdef MULTI_GPU
472 #else
474 #endif
475
476 int offset = 0;
477 for(int i=0; i<3; ++i){
478 for(int j=0; j<3; ++j){
479 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).real();
480 *(field + (oddBit*hfv + half_lattice_index)*18 + offset++) = (
mat)(i,j).imag();
481 }
482 }
483 return;
484 }
485
486 template<class Real>
488 int oddBit,
489 int half_lattice_index,
490 Real coeff,
491 Real* const field) const
492 {
493 #ifdef MULTI_GPU
495 #else
497 #endif
498 Real* const local_field = field + (oddBit*hfv + half_lattice_index)*18;
499
500
501 int offset = 0;
502 for(int i=0; i<3; ++i){
503 for(int j=0; j<3; ++j){
504 local_field[offset++] += coeff*
mat(i,j).real();
505 local_field[offset++] += coeff*
mat(i,j).imag();
506 }
507 }
508 return;
509 }
510
511
512 template<class Real>
514 int oddBit,
515 int dir,
516 int half_lattice_index,
517 Real coeff,
518 Real* const field) const
519 {
520
521 #ifdef MULTI_GPU
523 #else
525 #endif
526
527 //Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
528 int offset = 0;
529 for(int i=0; i<3; ++i){
530 for(int j=0; j<3; ++j){
531 //local_field[offset++] += coeff*mat(i,j).real();
532 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).real(), hfv);
533
534 //local_field[offset++] += coeff*mat(i,j).imag();
535 addData(field, half_lattice_index, dir, oddBit, offset++, coeff*
mat(i,j).imag(), hfv);
536 }
537 }
538 return;
539 }
540
541
542 template<class Real>
544 int oddBit,
545 int dir,
546 int half_lattice_index,
547 Real coeff,
548 Real* const field) const
549 {
550 Real* const mom_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*10;
551 mom_field[0] = (
mat(0,1).real() -
mat(1,0).real())*0.5*coeff;
552 mom_field[1] = (
mat(0,1).imag() +
mat(1,0).imag())*0.5*coeff;
553
554 mom_field[2] = (
mat(0,2).real() -
mat(2,0).real())*0.5*coeff;
555 mom_field[3] = (
mat(0,2).imag() +
mat(2,0).imag())*0.5*coeff;
556
557 mom_field[4] = (
mat(1,2).real() -
mat(2,1).real())*0.5*coeff;
558 mom_field[5] = (
mat(1,2).imag() +
mat(2,1).imag())*0.5*coeff;
559
560 const Real temp = (
mat(0,0).imag() +
mat(1,1).imag() +
mat(2,2).imag())*0.3333333333333333333;
561 mom_field[6] = (
mat(0,0).imag() - temp)*coeff;
562 mom_field[7] = (
mat(1,1).imag() - temp)*coeff;
563 mom_field[8] = (
mat(2,2).imag() - temp)*coeff;
564 mom_field[9] = 0.0;
565 return;
566 }
567
568
569
570 template<int oddBit>
572 {
573 private:
574 int local_dim[4];
575 int volume;
576 int half_index;
577 int full_index;
578 int full_coord[4];
579 void getCoordsFromHalfIndex(int half_index, int coord[4]);
580 void getCoordsFromFullIndex(int full_index, int coord[4]);
581 void cache(int half_lattice_index); // caches the half-lattice index, full-lattice index,
582 // and full-lattice coordinates
583
584
585 public:
587 int getFullFromHalfIndex(int half_lattice_index);
588 int getNeighborFromFullIndex(int full_lattice_index, int dir, int* err=NULL);
589 };
590
591 template<int oddBit>
593 volume = 1;
594 for(int dir=0; dir<4; ++dir){
595 local_dim[dir] =
dim[dir];
596 volume *= local_dim[dir];
597 }
598 }
599
600 // Store the half_lattice index, works out and stores the full lattice index
601 // and the coordinates.
602 template<int oddBit>
604 {
605 #ifdef MULTI_GPU
606 int E1 = local_dim[0]+4;
607 int E2 = local_dim[1]+4;
608 int E3 = local_dim[2]+4;
609 //int E4 = local_dim[3]+4;
611
612 int z1 = half_lattice_index/
E1h;
613 int x1h = half_lattice_index - z1*
E1h;
615 coord[1] = z1 - z2*
E2;
617 coord[2] = z2 - coord[3]*
E3;
618 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
619 coord[0] = 2*x1h + x1odd;
620 #else
621 int half_dim_0 = local_dim[0]/2;
622 int z1 = half_lattice_index/half_dim_0;
623 int x1h = half_lattice_index - z1*half_dim_0;
624 int z2 = z1/local_dim[1];
625 coord[1] = z1 - z2*local_dim[1];
626 coord[3] = z2/local_dim[2];
627 coord[2] = z2 - coord[3]*local_dim[2];
628 int x1odd = (coord[1] + coord[2] + coord[3] + oddBit) & 1;
629 coord[0] = 2*x1h + x1odd;
630 #endif
631
632 }
633
634 template<int oddBit>
636 {
637 #ifdef MULTI_GPU
638 int D1=local_dim[0]+4;
639 int D2=local_dim[1]+4;
640 int D3=local_dim[2]+4;
641 //int D4=local_dim[3]+4;
642 //int D1h=D1/2;
643 #else
644 int D1=local_dim[0];
645 int D2=local_dim[1];
646 int D3=local_dim[2];
647 //int D4=local_dim[3];
648 //int D1h=D1/2;
649 #endif
650
651 int z1 = full_lattice_index/D1;
652 coord[0] = full_lattice_index - z1*D1;
653 int z2 = z1/D2;
654 coord[1] = z1 - z2*D2;
655 coord[3] = z2/D3;
656 coord[2] = z2 - coord[3]*D3;
657 }
658
659
660
661
662
663 template<int oddBit>
665 {
666 half_index = half_lattice_index;
667 getCoordsFromHalfIndex(half_lattice_index, full_coord);
668 int x1odd = (full_coord[1] + full_coord[2] + full_coord[3] + oddBit) & 1;
669 full_index = 2*half_lattice_index + x1odd;
670 return;
671 }
672
673 template<int oddBit>
675 {
676 if(half_index != half_lattice_index) cache(half_lattice_index);
677 return full_index;
678 }
679
680 // From full index return the neighbouring full index
681 template<int oddBit>
683 {
684 if(err) *err = 0;
685
686 int coord[4];
687 int neighbor_index;
688 getCoordsFromFullIndex(full_lattice_index, coord);
689 #ifdef MULTI_GPU
690 int E1 = local_dim[0] + 4;
691 int E2 = local_dim[1] + 4;
692 int E3 = local_dim[2] + 4;
693 int E4 = local_dim[3] + 4;
694 switch(dir){
695 case 0: //+X
696 neighbor_index = full_lattice_index + 1;
697 if(err && (coord[0] ==
E1-1) ) *err = 1;
698 break;
699 case 1: //+Y
700 neighbor_index = full_lattice_index +
E1;
701 if(err && (coord[1] ==
E2-1) ) *err = 1;
702 break;
703 case 2: //+Z
704 neighbor_index = full_lattice_index +
E2*
E1;
705 if(err && (coord[2] ==
E3-1) ) *err = 1;
706 break;
707 case 3: //+T
708 neighbor_index = full_lattice_index +
E3*
E2*
E1;
709 if(err && (coord[3] ==
E4-1) ) *err = 1;
710 break;
711 case 7: //-X
712 neighbor_index = full_lattice_index - 1;
713 if(err && (coord[0] == 0) ) *err = 1;
714 break;
715 case 6: //-Y
716 neighbor_index = full_lattice_index -
E1;
717 if(err && (coord[1] == 0) ) *err = 1;
718 break;
719 case 5: //-Z
720 neighbor_index = full_lattice_index -
E2*
E1;
721 if(err && (coord[2] == 0) ) *err = 1;
722 break;
723 case 4: //-T
724 neighbor_index = full_lattice_index -
E3*
E2*
E1;
725 if(err && (coord[3] == 0) ) *err = 1;
726 break;
727 default:
728 errorQuda(
"Neighbor index could not be determined\n");
729 exit(1);
730 break;
731 } // switch(dir)
732
733 #else
734 switch(dir){
735 case 0:
736 neighbor_index = (coord[0] == local_dim[0]-1) ? full_lattice_index + 1 - local_dim[0] : full_lattice_index + 1;
737 break;
738 case 1:
739 neighbor_index = (coord[1] == local_dim[1]-1) ? full_lattice_index + local_dim[0]*(1 - local_dim[1]) : full_lattice_index + local_dim[0];
740 break;
741 case 2:
742 neighbor_index = (coord[2] == local_dim[2]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index + local_dim[0]*local_dim[1];
743 break;
744 case 3:
745 neighbor_index = (coord[3] == local_dim[3]-1) ? full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2]*(1-local_dim[3]) : full_lattice_index + local_dim[0]*local_dim[1]*local_dim[2];
746 break;
747 case 7:
748 neighbor_index = (coord[0] == 0) ? full_lattice_index - 1 + local_dim[0] : full_lattice_index - 1;
749 break;
750 case 6:
751 neighbor_index = (coord[1] == 0) ? full_lattice_index - local_dim[0]*(1 - local_dim[1]) : full_lattice_index - local_dim[0];
752 break;
753 case 5:
754 neighbor_index = (coord[2] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*(1 - local_dim[2]) : full_lattice_index - local_dim[0]*local_dim[1];
755 break;
756 case 4:
757 neighbor_index = (coord[3] == 0) ? full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2]*(1 - local_dim[3]) : full_lattice_index - local_dim[0]*local_dim[1]*local_dim[2];
758 break;
759 default:
760 errorQuda(
"Neighbor index could not be determined\n");
761 exit(1);
762 break;
763 } // switch(dir)
764 if(err) *err = 0;
765 #endif
766 return neighbor_index;
767 }
768
769 // Can't typedef a template
770 template<class Real>
772 {
774 };
775
776 template<class Real, int oddBit>
778 int half_lattice_index,
779 const Real* const oprod,
780 int sig, Real coeff,
782 Real* const output)
783 {
786 #ifdef MULTI_GPU
788 #else
789 int idx = half_lattice_index;
790 #endif
793 }
794 return;
795 }
796
797 template<class Real>
799 const Real* const oprod,
800 int sig, Real coeff,
801 Real* const output)
802 {
803 int volume = 1;
804 for(
int dir=0; dir<4; ++dir) volume *=
dim[dir];
805 const int half_volume = volume/2;
807 for(int site=0; site<half_volume; ++site){
808 computeOneLinkSite<Real,0>(
dim, site,
809 oprod,
810 sig, coeff, ls,
811 output);
812
813 }
814 // Loop over odd lattice sites
815 for(int site=0; site<half_volume; ++site){
816 computeOneLinkSite<Real,1>(
dim, site,
817 oprod,
818 sig, coeff, ls,
819 output);
820 }
821 return;
822 }
823
824
825
826
827
828
829
830
831
832 // middleLinkKernel compiles for now, but lots of debugging to be done
833 template<class Real, int oddBit>
836 const Real* const oprod,
837 const Real* const Qprev,
838 const Real* const link,
840 Real coeff,
841 const LoadStore<Real>& ls,
// pass a function object to read from and write to matrix fields
845 Real* const newOprod
846 )
847 {
849 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
850
851
853 int point_b, point_c, point_d;
854 int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx;
856
857 int err;
859 point_d = new_mem_idx >> 1;
860 // getNeighborFromFullIndex will work on any site on the lattice, odd or even
862 point_c = new_mem_idx >> 1;
863
865 point_b = new_mem_idx >> 1;
866
867 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
868 bc_link_nbr_idx = (mu_positive) ? point_c : point_b;
869 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
870
873
874
875
876
877
878 if(sig_positive){
880 }else{
882 }
883
884 if(mu_positive){
886 }else{
888 }
889
890 if(Qprev == NULL){
891 if(sig_positive){
893 }else{
895 colorMatY =
conj(colorMatY);
896 }
897 }else{ // Qprev != NULL
899 }
900
901 colorMatW = (!mu_positive) ? bc_link*colorMatY :
conj(bc_link)*colorMatY;
903
904 colorMatY = (sig_positive) ? ab_link*colorMatW :
conj(ab_link)*colorMatW;
906
907 if(mu_positive){
909 }else{
911 ad_link =
conj(ad_link);
912 }
913
914
915 if(Qprev == NULL){
916 if(sig_positive) colorMatY = colorMatW*ad_link;
918 }else{ // Qprev != NULL
919 if(
Qmu || sig_positive){
921 colorMatX= colorMatY*ad_link;
922 }
924 if(sig_positive) colorMatY = colorMatW*colorMatX;
925 }
926
927 if(sig_positive) ls.
addMatrixToField(colorMatY, oddBit, sig, half_lattice_index, coeff, newOprod);
928
929 return;
930 } // computeMiddleLinkSite
931
932
933 template<class Real>
935 const Real* const oprod,
936 const Real* const Qprev,
937 const Real* const link,
939 Real coeff,
943 Real* const newOprod
944 )
945 {
946
947 int volume = 1;
948 for(
int dir=0; dir<4; ++dir) volume *=
dim[dir];
949 #ifdef MULTI_GPU
950 const int loop_count =
Vh_ex;
951 #else
952 const int loop_count = volume/2;
953 #endif
954 // loop over the lattice volume
955 // To keep the code as close to the GPU code as possible, we'll
956 // loop over the even sites first and then the odd sites
958 for(int site=0; site<loop_count; ++site){
959 computeMiddleLinkSite<Real, 0>(site,
dim,
960 oprod, Qprev, link,
962 ls,
964 }
965 // Loop over odd lattice sites
966 for(int site=0; site<loop_count; ++site){
967 computeMiddleLinkSite<Real,1>(site,
dim,
968 oprod, Qprev, link,
970 ls,
972 }
973 return;
974 }
975
976
977
978
979 template<class Real, int oddBit>
982 const Real*
const P3,
983 const Real* const Qprod, // why?
984 const Real* const link,
986 Real coeff, Real accumu_coeff,
987 const LoadStore<Real>& ls,
// pass a function object to read from and write to matrix fields
988 Real* const shortP,
989 Real* const newOprod
990 )
991 {
992
994 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
995
997 int point_d;
998 int ad_link_nbr_idx;
1000
1001 int err;
1003 point_d = new_mem_idx >> 1;
1004 ad_link_nbr_idx = (mu_positive) ? point_d : half_lattice_index;
1005
1006
1009
1011
1012 if(shortP){
1013 if(mu_positive){
1014 ad_link_nbr_idx = point_d;
1016 }else{
1017 ad_link_nbr_idx = half_lattice_index;
1019 }
1020 colorMatW = (mu_positive) ? ad_link*colorMatY :
conj(ad_link)*colorMatY;
1022 } // if(shortP)
1023
1024
1025 Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1026
1027 if(Qprod){
1029 if(mu_positive){
1030 colorMatW = colorMatY*colorMatX;
1031 if(!oddBit){ mycoeff = -mycoeff; }
1033 }else{
1034 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1035 if(oddBit){ mycoeff = -mycoeff; }
1037 }
1038 }
1039
1040 if(!Qprod){
1041 if(mu_positive){
1042 if(!oddBit){ mycoeff = -mycoeff; }
1044 }else{
1045 if(oddBit){ mycoeff = -mycoeff; }
1046 colorMatW =
conj(colorMatY);
1048 }
1049 } // if !(Qprod)
1050
1051 return;
1052 } // computeSideLinkSite
1053
1054
1055 // Maybe change to computeSideLinkField
1056 template<class Real>
1058 const Real*
const P3,
1059 const Real* const Qprod, // why?
1060 const Real* const link,
1062 Real coeff, Real accumu_coeff,
1063 Real* const shortP,
1064 Real* const newOprod
1065 )
1066 {
1067 // Need some way of setting half_volume
1068 int volume = 1;
1069 for(
int dir=0; dir<4; ++dir) volume *=
dim[dir];
1070 #ifdef MULTI_GPU
1071 const int loop_count =
Vh_ex;
1072 #else
1073 const int loop_count = volume/2;
1074 #endif
1076
1077 for(int site=0; site<loop_count; ++site){
1078 computeSideLinkSite<Real,0>(site,
dim,
1081 coeff, accumu_coeff,
1082 ls, shortP, newOprod);
1083 }
1084
1085 for(int site=0; site<loop_count; ++site){
1086 computeSideLinkSite<Real,1>(site,
dim,
1089 coeff, accumu_coeff,
1090 ls, shortP, newOprod);
1091 }
1092
1093 return;
1094 }
1095
1096
1097
1098
1099
1100 template<class Real, int oddBit>
1103 const Real* const oprod,
1104 const Real* const Qprev,
1105 const Real* const link,
1107 Real coeff, Real accumu_coeff,
1108 const LoadStore<Real>& ls,
// pass a function object to read from and write to matrix fields
1109 Real* const shortP,
1110 Real* const newOprod)
1111 {
1112
1114 const bool sig_positive = (
GOES_FORWARDS(sig)) ?
true :
false;
1115
1118
1119
1120 int ab_link_nbr_idx, point_b, point_c, point_d;
1121
1124
1125 int err;
1127 point_d = new_mem_idx >> 1;
1128
1130 point_c = new_mem_idx >> 1;
1131
1133 point_b = new_mem_idx >> 1;
1134 ab_link_nbr_idx = (sig_positive) ? half_lattice_index : point_b;
1135
1136
1137
1138 Real mycoeff = ( (sig_positive && oddBit) || (!sig_positive && !oddBit) ) ? coeff : -coeff;
1139
1140 if(mu_positive){
1143 // compute point_c
1146 colorMatZ =
conj(bc_link)*colorMatY;
// okay
1147
1148 if(sig_positive)
1149 {
1150 colorMatY = colorMatX*ad_link;
1151 colorMatW = colorMatZ*colorMatY;
1153 }
1154
1155 if(sig_positive){
1157 }else{
1159 }
1160 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
// okay
1161 colorMatW = colorMatY*colorMatX;
1163 colorMatW = ad_link*colorMatY;
1164 ls.
addMatrixToField(colorMatW, 1-oddBit, point_d, accumu_coeff, shortP);
// Warning! Need to check this!
1165 }else{ //negative mu
1171
1172 if(sig_positive) colorMatW = colorMatX*
conj(ad_link);
1173 colorMatZ = bc_link*colorMatY;
1174 if(sig_positive){
1175 colorMatY = colorMatZ*colorMatW;
1177 }
1178
1179 if(sig_positive){
1181 }else{
1183 }
1184
1185 colorMatY = (sig_positive) ? ab_link*colorMatZ :
conj(ab_link)*colorMatZ;
// 611
1186 colorMatW =
conj(colorMatX)*
conj(colorMatY);
1187
1189 colorMatW =
conj(ad_link)*colorMatY;
1191 } // end mu
1192 return;
1193 } // allLinkKernel
1194
1195
1196
1197 template<class Real>
1199 const Real* const oprod,
1200 const Real* const Qprev,
1201 const Real* const link,
1203 Real coeff, Real accumu_coeff,
1204 Real* const shortP,
1205 Real* const newOprod)
1206 {
1207 int volume = 1;
1208 for(
int dir=0; dir<4; ++dir) volume *=
dim[dir];
1209 #ifdef MULTI_GPU
1210 const int loop_count =
Vh_ex;
1211 #else
1212 const int loop_count = volume/2;
1213 #endif
1214
1216 for(int site=0; site<loop_count; ++site){
1217
1218 computeAllLinkSite<Real,0>(site,
dim,
1219 oprod, Qprev, link,
1221 coeff, accumu_coeff,
1222 ls,
1223 shortP, newOprod);
1224 }
1225
1226 for(int site=0; site<loop_count; ++site){
1227 computeAllLinkSite<Real, 1>(site,
dim,
1228 oprod, Qprev, link,
1230 coeff, accumu_coeff,
1231 ls,
1232 shortP, newOprod);
1233 }
1234
1235 return;
1236 }
1237
1238 #define Pmu tempmat[0]
1239 #define P3 tempmat[1]
1240 #define P5 tempmat[2]
1241 #define Pnumu tempmat[3]
1242 #define Qmu tempmat[4]
1243 #define Qnumu tempmat[5]
1244
1245 template<class Real>
1247 {
1254 };
1255
1256
1257 template<class Real>
1260 Real* oprod,
1261 Real* link,
1262 Real** tempmat,
1263 Real* newOprod)
1264 {
1265 Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;
1266
1267 OneLink = staple_coeff.
one;
1268 ThreeSt = staple_coeff.
three;
1269 FiveSt = staple_coeff.
five;
1270 SevenSt = staple_coeff.
seven;
1271 Lepage = staple_coeff.
lepage;
1272
1273 for(int sig=0; sig<4; ++sig){
1275 oprod,
1276 sig, OneLink,
1277 newOprod);
1278 }
1279
1280
1281 // sig labels the net displacement of the staple
1282 for(int sig=0; sig<8; ++sig){
1283 for(
int mu=0;
mu<8; ++
mu){
1285
1286 computeMiddleLinkField<Real>(
dim,
1287 oprod, NULL, link,
1290 newOprod);
1291
1292 for(int nu=0; nu<8; ++nu){
1294 || nu==sig || nu==
OPP_DIR(sig) )
continue;
1295
1296 computeMiddleLinkField<Real>(
dim,
1298 sig, nu, staple_coeff.
five,
1300 newOprod);
1301
1302
1303 for(int rho=0; rho<8; ++rho){
1304 if( rho == sig || rho ==
OPP_DIR(sig)
1306 || rho == nu || rho ==
OPP_DIR(nu) )
1307 {
1308 continue;
1309 }
1310
1311 if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1312 computeAllLinkField<Real>(
dim,
1314 sig, rho, staple_coeff.
seven, coeff,
1316
1317 } // rho
1318
1319 // 5-staple: side link
1320 if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1321 computeSideLinkField<Real>(
dim,
1323 sig, nu, -FiveSt, coeff,
1325
1326
1327 } // nu
1328
1329
1330 // lepage
1331 if(staple_coeff.
lepage != 0.){
1332 computeMiddleLinkField<Real>(
dim,
1336 newOprod);
1337
1338 if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
1339 computeSideLinkField<Real>(
dim,
1341 sig,
mu, -Lepage, coeff,
1343 } // lepage != 0
1344
1345
1346 computeSideLinkField<Real>(
dim,
1348 sig,
mu, ThreeSt, 0.,
1349 NULL, newOprod);
1350 } // mu
1351 } // sig
1352
1353 // Need also to compute the one-link contribution
1354 return;
1355 }
1356
1357 #undef Pmu
1358 #undef P3
1359 #undef P5
1360 #undef Pnumu
1361 #undef Qmu
1362 #undef Qnumu
1363
1364
1370 {
1371 int volume = 1;
1372 for(
int dir=0; dir<4; ++dir) volume *=
param.
X[dir];
1373
1374 #ifdef MULTI_GPU
1376 #else
1377 int len = volume;
1378 #endif
1379 // allocate memory for temporary fields
1380 void* tempmat[6];
1382 for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(double));
1383 }else{
1384 for(int i=0; i<6; ++i) tempmat[i] = malloc(len*18*sizeof(float));
1385 }
1386
1388 act_path_coeff.
one = path_coeff[0];
1389 act_path_coeff.
naik = path_coeff[1];
1390 act_path_coeff.
three = path_coeff[2];
1391 act_path_coeff.
five = path_coeff[3];
1392 act_path_coeff.
seven = path_coeff[4];
1393 act_path_coeff.
lepage = path_coeff[5];
1394
1396 doHisqStaplesForceCPU<double>(
param.
X,
1397 act_path_coeff,
1400 (double**)tempmat,
1402 );
1403
1405 doHisqStaplesForceCPU<float>(
param.
X,
1406 act_path_coeff,
1409 (float**)tempmat,
1411 );
1412 }else{
1414 }
1415
1416 for(int i=0; i<6; ++i){
1417 free(tempmat[i]);
1418 }
1419 return;
1420 }
1421
1422
1423
1424 template<class Real, int oddBit>
1427 const Real* const oprod,
1428 const Real* const link,
1429 int sig, Real coeff,
1431 Real* const output)
1432 {
1434
1436
1439
1440 int point_a, point_b, point_c, point_d, point_e;
1441 #ifdef MULTI_GPU
1443 #else
1444 int idx = half_lattice_index;
1445 #endif
1446
1448 point_c = idx;
1449
1451 point_d = new_mem_idx >> 1;
1452
1454 point_e = new_mem_idx >> 1;
1455
1457 point_b = new_mem_idx >> 1;
1458
1460 point_a = new_mem_idx >> 1;
1461
1466
1470
1471 colorMatV = de_link*ef_link*colorMatZ
1472 - de_link*colorMatY*bc_link
1473 + colorMatX*ab_link*bc_link;
1474
1476 }
1477 return;
1478 }
1479
1480 template<class Real>
1482 const Real* const oprod,
1483 const Real* const link,
1484 int sig, Real coeff,
1485 Real* const output)
1486 {
1487 int volume = 1;
1488 for(
int dir=0; dir<4; ++dir) volume *=
dim[dir];
1489 const int half_volume = volume/2;
1490
1492 for(int site=0; site<half_volume; ++site){
1493 computeLongLinkSite<Real,0>(site,
1495 oprod,
1496 link,
1497 sig, coeff, ls,
1498 output);
1499
1500 }
1501 // Loop over odd lattice sites
1502 for(int site=0; site<half_volume; ++site){
1503 computeLongLinkSite<Real,1>(site,
1505 oprod,
1506 link,
1507 sig, coeff, ls,
1508 output);
1509 }
1510 return;
1511 }
1512
1513
1519 {
1520 for(int sig=0; sig<4; ++sig){
1522 computeLongLinkField<float>(
param.
X,
1525 sig, coeff,
1528 computeLongLinkField<double>(
param.
X,
1531 sig, coeff,
1532 (
double*)newOprod->
Gauge_p());
1533 }else {
1535 }
1536 } // sig
1537 return;
1538 }
1539
1540
1541 template<class Real, int oddBit>
1544 const Real* const oprod,
1545 const Real* const link,
1546 int sig,
1548 Real* const mom)
1549 {
1550
1552
1553 #ifdef MULTI_GPU
1555 int idx = half_lattice_index_ex;
1556 #else
1557 int idx = half_lattice_index;
1558 #endif
1561
1562 const Real coeff = (oddBit) ? -1 : 1;
1563 colorMatY = linkW*colorMatX;
1564
1566 return;
1567 }
1568
1569 template <class Real>
1571 const Real* const oprod,
1572 const Real* const link,
1573 int sig,
1574 Real* const mom)
1575 {
1577 const int half_volume = volume/2;
1579
1580
1581 for(int site=0; site<half_volume; ++site){
1582 completeForceSite<Real,0>(site,
1584 oprod, link,
1585 sig,
1586 ls,
1587 mom);
1588
1589 }
1590 for(int site=0; site<half_volume; ++site){
1591 completeForceSite<Real,1>(site,
1593 oprod, link,
1594 sig,
1595 ls,
1596 mom);
1597 }
1598 return;
1599 }
1600
1601
1606 {
1607 for(int sig=0; sig<4; ++sig){
1609 completeForceField<float>(
param.
X,
1612 sig,
1615 completeForceField<double>(
param.
X,
1618 sig,
1620 }else{
1622 }
1623 } // loop over sig
1624 return;
1625 }
1626
1627
1628 //} // namespace quda
1629
1630
1631
1632
void storeMatrixToMomentumField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int dir, int half_lattice_index, Real coeff, Real *const) const
Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const
void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index, Matrix< 3, std::complex< Real > > *const mat) const
int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const
int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const
void storeMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real *const field) const
void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const
void addMatrixToField(const Matrix< 3, std::complex< Real > > &mat, int oddBit, int half_lattice_index, Real coeff, Real *const) const
T & operator()(int i, int j)
Matrix & operator-=(const Matrix< N, T > &mat)
Matrix & operator+=(const Matrix< N, T > &mat)
const T & operator()(int i, int j) const
__device__ __host__ Matrix()
__device__ __host__ T const & operator()(int i, int j) const
void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)
void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, Real *const output)
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void computeAllLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
void completeForceSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, const LoadStore< Real > &ls, Real *const mom)
void computeSideLinkSite(int half_lattice_index, const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, const LoadStore< Real > &ls, Real *const shortP, Real *const newOprod)
void doHisqStaplesForceCPU(const int dim[4], PathCoefficients< double > staple_coeff, Real *oprod, Real *link, Real **tempmat, Real *newOprod)
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam ¶m, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
Matrix< N, T > transpose(const Matrix< N, std::complex< T > > &mat)
void computeMiddleLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link, int sig, int mu, Real coeff, const LoadStore< Real > &ls, Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)
void computeSideLinkField(const int dim[4], const Real *const P3, const Real *const Qprod, const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
void hisqCompleteForceCPU(const QudaGaugeParam ¶m, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *mom)
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam ¶m, cpuGaugeField &oprod, cpuGaugeField &link, cpuGaugeField *newOprod)
void computeOneLinkSite(const int dim[4], int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore< Real > &ls, Real *const output)
#define GOES_FORWARDS(dir)
__host__ __device__ ValueType conj(ValueType x)
__host__ __device__ float4 operator+=(float4 &x, const float4 &y)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__host__ __device__ float4 operator-=(float4 &x, const float4 &y)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator-(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor subtraction operator.
std::ostream & operator<<(std::ostream &output, const CloverFieldParam ¶m)
Main header file for the QUDA library.
Matrix< 3, std::complex< Real > > Type
Matrix< N, T > operator()() const
int getNeighborFromFullIndex(int full_lattice_index, int dir, int *err=NULL)
Locator(const int dim[4])
int getFullFromHalfIndex(int half_lattice_index)
Matrix< N, T > operator()() const