Mirror of https://github.com/paboyle/Grid.git

	revert Add/SubTimesI and prefetching in stencil
This reverts commit 9b2699226c.
			
			
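For context, the Add/SubTimesI helpers removed by this revert are built on SVE's fcadd instruction, which computes a ± i*b on interleaved complex lanes in a single operation. A minimal stand-alone sketch of the idea, using raw ACLE intrinsics instead of Grid's acle<> predicate wrappers and vecf/vecd typedefs (those wrapper names are taken from the hunks below and are not reproduced here):

```cpp
// Sketch only: mirrors the AddTimesI/SubTimesI functors removed in this revert,
// written against plain SVE types rather than Grid's vecf/vecd and acle<> helpers.
#include <arm_sve.h>

// a + i*b : fcadd with a 90-degree rotation
static inline svfloat32_t add_times_i_f32(svfloat32_t a, svfloat32_t b) {
  svbool_t pg = svptrue_b32();
  return svcadd_x(pg, a, b, 90);
}

// a - i*b : fcadd with a 270-degree rotation
static inline svfloat32_t sub_times_i_f32(svfloat32_t a, svfloat32_t b) {
  svbool_t pg = svptrue_b32();
  return svcadd_x(pg, a, b, 270);
}

// For comparison, i*a on its own needs an explicit zero operand, and the caller
// still has to add the result to something; the fused forms above avoid that.
static inline svfloat32_t times_i_f32(svfloat32_t a) {
  svbool_t pg = svptrue_b32();
  svfloat32_t z = svdup_n_f32(0.0f);
  return svcadd_x(pg, z, a, 90);   // 0 + i*a
}
```

At the tensor level the reverted code exposed these as addTimesI(x, y) = x + timesI(y) and subTimesI(x, y) = x - timesI(y); the revert returns the spin projectors and reconstruction routines to the separate add/subtract plus timesI form.
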
		| @@ -164,7 +164,12 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | ||||
|   if((!local)&&(!st.same_node[Dir]) ) {					    \ | ||||
|     LOAD_CHI(base);							                \ | ||||
|     MULT_2SPIN_1(Dir);					                    \ | ||||
|     PREFETCH_CHIMU(base);                                   \ | ||||
|     /* PREFETCH_GAUGE_L1(NxtDir); */                        \ | ||||
|     MULT_2SPIN_2;					                        \ | ||||
|     if (s == 0) {                                           \ | ||||
|       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ | ||||
|     }                                                       \ | ||||
|     RECON;								                    \ | ||||
|     nmu++;								                    \ | ||||
|   } | ||||
| @@ -175,7 +180,12 @@ Author:  Nils Meyer  <nils.meyer@ur.de>  Regensburg University | ||||
|   if((!local)&&(!st.same_node[Dir]) ) {					    \ | ||||
|     LOAD_CHI(base);							                \ | ||||
|     MULT_2SPIN_1(Dir);					                    \ | ||||
|     PREFETCH_CHIMU(base);                                   \ | ||||
|     /* PREFETCH_GAUGE_L1(NxtDir); */                        \ | ||||
|     MULT_2SPIN_2;					                        \ | ||||
|     if (s == 0) {                                           \ | ||||
|       if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ | ||||
|     }                                                       \ | ||||
|     RECON;								                    \ | ||||
|     nmu++;								                    \ | ||||
|   } | ||||
|   | ||||
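
The s == 0 and (Dir == 0 || Dir == 4) condition in the two hunks above gates PREFETCH_GAUGE_L2 so the heavier L2 gauge prefetch is issued only on the first s-slice and only for directions 0 and 4. A rough sketch of that gating in isolation, with prefetch_l2 standing in for whatever PREFETCH_GAUGE_L2(Dir) expands to (an assumption, based on the svprfd/SV_PLDL2STRM usage elsewhere in this commit):

```cpp
// Sketch of the prefetch gating shown above, not the actual macro expansion.
#include <arm_sve.h>

static inline void prefetch_l2(const void *p) {
  // streaming prefetch into L2, as in the svprfd calls elsewhere in this diff
  svprfd(svptrue_b64(), p, SV_PLDL2STRM);
}

// Called from the per-direction leg of the stencil kernel, which loops over
// the fifth dimension (s) and the stencil directions (Dir).
static inline void maybe_prefetch_gauge(int s, int Dir, const void *gauge_base) {
  if (s == 0) {                        // once per 4-d site, not per s-slice
    if ((Dir == 0) || (Dir == 4)) {    // and only for two of the directions
      prefetch_l2(gauge_base);
    }
  }
}
```
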
| @@ -445,21 +445,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
| #ifndef GRID_NVCC | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); /* printf("."); */   return;} | ||||
|      //if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");  return;} | ||||
| #endif | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} | ||||
| #ifndef GRID_NVCC | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); /* printf("-"); */   return;} | ||||
|      //if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);  printf("-");   return;} | ||||
| #endif | ||||
|    } else if( exterior ) {  | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;} | ||||
| #ifndef GRID_NVCC | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); /* printf("+"); */   return;} | ||||
|      //if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");  return;} | ||||
| #endif | ||||
|    } | ||||
|    assert(0 && " Kernel optimisation case not covered "); | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/qcd/spin/TwoSpinor.h | ||||
|  | ||||
| @@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Normalisation alert; the g5   project is 1/2(1+-G5) | ||||
| // Normalisation alert; the g5   project is 1/2(1+-G5)  | ||||
| //                      the xyzt projects are (1+-Gxyzt) | ||||
| // | ||||
| // * xyzt project | ||||
| @@ -59,7 +59,7 @@ NAMESPACE_BEGIN(Grid); | ||||
| // | ||||
| // Both four spinor and two spinor result variants are provided. | ||||
| // | ||||
| // The four spinor project will be recursively provided to Lattice wide routines, and likely used in | ||||
| // The four spinor project will be recursively provided to Lattice wide routines, and likely used in  | ||||
| // the domain wall and mobius implementations. | ||||
| // | ||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -74,17 +74,13 @@ NAMESPACE_BEGIN(Grid); | ||||
| // To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) ) | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //hspin(0)=fspin(0)+timesI(fspin(3)); | ||||
|   //hspin(1)=fspin(1)+timesI(fspin(2)); | ||||
|   hspin(0)=addTimesI(fspin(0), fspin(3)); | ||||
|   hspin(1)=addTimesI(fspin(1), fspin(2)); | ||||
|   hspin(0)=fspin(0)+timesI(fspin(3)); | ||||
|   hspin(1)=fspin(1)+timesI(fspin(2)); | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //hspin(0)=fspin(0)-timesI(fspin(3)); | ||||
|   //hspin(1)=fspin(1)-timesI(fspin(2)); | ||||
|   hspin(0)=subTimesI(fspin(0), fspin(3)); | ||||
|   hspin(1)=subTimesI(fspin(1), fspin(2)); | ||||
|   hspin(0)=fspin(0)-timesI(fspin(3)); | ||||
|   hspin(1)=fspin(1)-timesI(fspin(2)); | ||||
| } | ||||
|  | ||||
| //  0 0  0  -1  [0] -+ [3] | ||||
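
Both forms of spProjXp/spProjXm in the hunk above compute the same half spinor: hspin(0) = fspin(0) ± i*fspin(3) and hspin(1) = fspin(1) ± i*fspin(2). A scalar reference that checks this equivalence with std::complex instead of Grid's tensor types (a sketch, not Grid code):

```cpp
// Scalar check of the X projections shown above, independent of Grid.
#include <array>
#include <cassert>
#include <complex>

using cplx = std::complex<double>;

cplx add_times_i(const cplx &a, const cplx &b) { return a + cplx(0, 1) * b; } // a + i*b
cplx sub_times_i(const cplx &a, const cplx &b) { return a - cplx(0, 1) * b; } // a - i*b

int main() {
  const std::array<cplx, 4> fspin = {cplx(1, 2), cplx(3, -1), cplx(0.5, 0), cplx(-2, 4)};

  // spProjXp: hspin(0) = fspin(0) + i*fspin(3), hspin(1) = fspin(1) + i*fspin(2)
  assert(add_times_i(fspin[0], fspin[3]) == fspin[0] + cplx(0, 1) * fspin[3]);
  assert(add_times_i(fspin[1], fspin[2]) == fspin[1] + cplx(0, 1) * fspin[2]);

  // spProjXm: hspin(0) = fspin(0) - i*fspin(3), hspin(1) = fspin(1) - i*fspin(2)
  assert(sub_times_i(fspin[0], fspin[3]) == fspin[0] - cplx(0, 1) * fspin[3]);
  assert(sub_times_i(fspin[1], fspin[2]) == fspin[1] - cplx(0, 1) * fspin[2]);
  return 0;
}
```
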
| @@ -109,18 +105,14 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
|  */ | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //hspin(0)=fspin(0)+timesI(fspin(2)); | ||||
|   //hspin(1)=fspin(1)-timesI(fspin(3)); | ||||
|   hspin(0)=addTimesI(fspin(0), fspin(2)); | ||||
|   hspin(1)=subTimesI(fspin(1), fspin(3)); | ||||
|   hspin(0)=fspin(0)+timesI(fspin(2)); | ||||
|   hspin(1)=fspin(1)-timesI(fspin(3)); | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   //hspin(0)=fspin(0)-timesI(fspin(2)); | ||||
|   //hspin(1)=fspin(1)+timesI(fspin(3)); | ||||
|   hspin(0)=subTimesI(fspin(0), fspin(2)); | ||||
|   hspin(1)=addTimesI(fspin(1), fspin(3)); | ||||
|   hspin(0)=fspin(0)-timesI(fspin(2)); | ||||
|   hspin(1)=fspin(1)+timesI(fspin(3)); | ||||
| } | ||||
| /*Gt | ||||
|  *  0 0  1  0 [0]+-[2] | ||||
| @@ -141,8 +133,8 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
|   hspin(1)=fspin(1)-fspin(3); | ||||
| } | ||||
| /*G5 | ||||
|  *  1 0  0  0 | ||||
|  *  0 1  0  0 | ||||
|  *  1 0  0  0  | ||||
|  *  0 1  0  0  | ||||
|  *  0 0 -1  0 | ||||
|  *  0 0  0 -1 | ||||
|  */ | ||||
| @@ -160,7 +152,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
|   hspin(0)=fspin(2); | ||||
|   hspin(1)=fspin(3); | ||||
| } | ||||
|  | ||||
|    | ||||
| //  template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
| @@ -210,20 +202,16 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   //fspin(2)-=timesI(hspin(1)); | ||||
|   //fspin(3)-=timesI(hspin(0)); | ||||
|   fspin(2)=subTimesI(fspin(2), hspin(1)); | ||||
|   fspin(3)=subTimesI(fspin(3), hspin(0)); | ||||
|   fspin(2)-=timesI(hspin(1)); | ||||
|   fspin(3)-=timesI(hspin(0)); | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   //fspin(2)+=timesI(hspin(1)); | ||||
|   //fspin(3)+=timesI(hspin(0)); | ||||
|   fspin(2)=addTimesI(fspin(2), hspin(1)); | ||||
|   fspin(3)=addTimesI(fspin(3), hspin(0)); | ||||
|   fspin(2)+=timesI(hspin(1)); | ||||
|   fspin(3)+=timesI(hspin(0)); | ||||
| } | ||||
|  | ||||
| //  0 0  0  -1  [0] -+ [3] | ||||
| @@ -291,20 +279,16 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   //fspin(2)-=timesI(hspin(0)); | ||||
|   //fspin(3)+=timesI(hspin(1)); | ||||
|   fspin(2)=subTimesI(fspin(2), hspin(0)); | ||||
|   fspin(3)=addTimesI(fspin(3), hspin(1)); | ||||
|   fspin(2)-=timesI(hspin(0)); | ||||
|   fspin(3)+=timesI(hspin(1)); | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   //fspin(2)+=timesI(hspin(0)); | ||||
|   //fspin(3)-=timesI(hspin(1)); | ||||
|   fspin(2)=addTimesI(fspin(2), hspin(0)); | ||||
|   fspin(3)=subTimesI(fspin(3), hspin(1)); | ||||
|   fspin(2)+=timesI(hspin(0)); | ||||
|   fspin(3)-=timesI(hspin(1)); | ||||
| } | ||||
| /*Gt | ||||
|  *  0 0  1  0 [0]+-[2] | ||||
| @@ -345,8 +329,8 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|   fspin(3)-=hspin(1); | ||||
| } | ||||
| /*G5 | ||||
|  *  1 0  0  0 | ||||
|  *  0 1  0  0 | ||||
|  *  1 0  0  0  | ||||
|  *  0 1  0  0  | ||||
|  *  0 0 -1  0 | ||||
|  *  0 0  0 -1 | ||||
|  */ | ||||
| @@ -399,7 +383,7 @@ template<class rtype,class vtype> accelerator_inline void spProjXp (iScalar<rtyp | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjXp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjXp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -418,7 +402,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconXp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconXp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -436,7 +420,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconXp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -462,7 +446,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjXm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -484,7 +468,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconXm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -505,7 +489,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconXm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -531,7 +515,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjYp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -553,7 +537,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconYp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -574,7 +558,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconYp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -599,7 +583,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjYm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -621,7 +605,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconYm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -642,7 +626,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconYm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -667,7 +651,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -689,7 +673,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -710,7 +694,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -735,7 +719,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -757,7 +741,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -778,7 +762,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -803,7 +787,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjTp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -825,7 +809,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconTp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -846,7 +830,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconTp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -871,7 +855,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -893,7 +877,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -914,7 +898,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -939,7 +923,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -960,7 +944,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spRecon5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -981,7 +965,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -1006,7 +990,7 @@ template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inlin | ||||
| template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -1029,7 +1013,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accel | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -1050,7 +1034,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spRecon5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -1071,7 +1055,7 @@ template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accel | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| @@ -1097,7 +1081,7 @@ template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inlin | ||||
| template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){ | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
|   | ||||
| @@ -442,59 +442,6 @@ struct TimesMinusI{ | ||||
|   } | ||||
| }; | ||||
|  | ||||
| // alternative implementation using fcadd | ||||
| // this is not optimal because we have  op1 = op2 + TimesMinusI(op3) = op2 - TimesI(op3)  etc | ||||
| // but ideally we have                  op1 = SubTimesI(op2,op3) | ||||
| // | ||||
| // makes performance worse in Benchmark_wilson using MPI | ||||
| // increases halogtime and gathertime | ||||
| /* | ||||
| struct TimesMinusI{ | ||||
|   // Complex float | ||||
|   inline vecf operator()(vecf a, vecf b){ | ||||
|     pred pg1 = acle<float>::pg1(); | ||||
|     vecf z_v = acle<float>::zero(); | ||||
|  | ||||
|     return svcadd_x(pg1, z_v, a, 270); | ||||
|   } | ||||
|   // Complex double | ||||
|   inline vecd operator()(vecd a, vecd b){ | ||||
|     pred pg1 = acle<double>::pg1(); | ||||
|     vecd z_v = acle<double>::zero(); | ||||
|  | ||||
|     return svcadd_x(pg1, z_v, a, 270); | ||||
|   } | ||||
| }; | ||||
| */ | ||||
|  | ||||
| // SVE only, fcadd returns  a +- i*b | ||||
| // a + i * b | ||||
| struct AddTimesI{ | ||||
|   // Complex float | ||||
|   inline vecf operator()(vecf a, vecf b){ | ||||
|     pred pg1 = acle<float>::pg1(); | ||||
|     return svcadd_x(pg1, a, b, 90); | ||||
|   } | ||||
|   // Complex double | ||||
|   inline vecd operator()(vecd a, vecd b){ | ||||
|     pred pg1 = acle<double>::pg1(); | ||||
|     return svcadd_x(pg1, a, b, 90); | ||||
|   } | ||||
| }; | ||||
| // a - i * b | ||||
| struct SubTimesI{ | ||||
|   // Complex float | ||||
|   inline vecf operator()(vecf a, vecf b){ | ||||
|     pred pg1 = acle<float>::pg1(); | ||||
|     return svcadd_x(pg1, a, b, 270); | ||||
|   } | ||||
|   // Complex double | ||||
|   inline vecd operator()(vecd a, vecd b){ | ||||
|     pred pg1 = acle<double>::pg1(); | ||||
|     return svcadd_x(pg1, a, b, 270); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| struct TimesI{ | ||||
|   // Complex float | ||||
|   inline vecf operator()(vecf a, vecf b){ | ||||
| @@ -518,33 +465,6 @@ struct TimesI{ | ||||
|   } | ||||
| }; | ||||
|  | ||||
|  | ||||
| // alternative implementation using fcadd | ||||
| // this is not optimal because we have  op1 = op2 + TimesI(op3)  etc | ||||
| // ideally we have                      op1 = AddTimesI(op2,op3) | ||||
| // | ||||
| // makes performance worse in Benchmark_wilson using MPI | ||||
| // increases halogtime and gathertime | ||||
| /* | ||||
| struct TimesI{ | ||||
|   // Complex float | ||||
|   inline vecf operator()(vecf a, vecf b){ | ||||
|     pred pg1 = acle<float>::pg1(); | ||||
|     vecf z_v = acle<float>::zero(); | ||||
|  | ||||
|     return svcadd_x(pg1, z_v, a, 90); | ||||
|   } | ||||
|   // Complex double | ||||
|   inline vecd operator()(vecd a, vecd b){ | ||||
|     pred pg1 = acle<double>::pg1(); | ||||
|     vecd z_v = acle<double>::zero(); | ||||
|  | ||||
|     return svcadd_x(pg1, z_v, a, 90); | ||||
|   } | ||||
| }; | ||||
| */ | ||||
|  | ||||
|  | ||||
| struct PrecisionChange { | ||||
|   static inline vech StoH (vecf sa, vecf sb) { | ||||
|     pred pg1s = acle<float>::pg1(); | ||||
| @@ -827,25 +747,6 @@ typedef veci SIMD_Itype; // Integer type | ||||
|  | ||||
| // prefetch utilities | ||||
| inline void v_prefetch0(int size, const char *ptr){}; | ||||
|  | ||||
| /* PF 256 | ||||
| inline void prefetch_HINT_T0(const char *ptr){ | ||||
|   static int64_t last_ptr; | ||||
|   int64_t vptr = reinterpret_cast<std::intptr_t>(ptr) & 0x7fffffffffffff00ll; | ||||
|   if (last_ptr != vptr) { | ||||
|     last_ptr = vptr; | ||||
|     pred pg1 = Optimization::acle<double>::pg1(); | ||||
|     svprfd(pg1, reinterpret_cast<int64_t*>(ptr), SV_PLDL1STRM); | ||||
|     svprfd(pg1, ptr, SV_PLDL1STRM); | ||||
|   } | ||||
| }; | ||||
| */ | ||||
| /* PF 64 | ||||
| inline void prefetch_HINT_T0(const char *ptr){ | ||||
|   pred pg1 = Optimization::acle<double>::pg1(); | ||||
|   svprfd(pg1, ptr, SV_PLDL1STRM); | ||||
| }; | ||||
| */ | ||||
| inline void prefetch_HINT_T0(const char *ptr){}; | ||||
|  | ||||
| // Function name aliases | ||||
| @@ -867,8 +768,5 @@ typedef Optimization::MaddRealPart   MaddRealPartSIMD; | ||||
| typedef Optimization::Conj           ConjSIMD; | ||||
| typedef Optimization::TimesMinusI    TimesMinusISIMD; | ||||
| typedef Optimization::TimesI         TimesISIMD; | ||||
| typedef Optimization::AddTimesI      AddTimesISIMD; | ||||
| typedef Optimization::SubTimesI      SubTimesISIMD; | ||||
|  | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
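
The commented-out "PF 256" variant deleted in the prefetch-utilities hunk above deduplicates prefetches by masking the address down to a 256-byte block and only issuing svprfd when the block changes. A stand-alone sketch of that idea (the mask and the SV_PLDL1STRM hint are taken from the deleted code; as there, the static state is shared across threads):

```cpp
// Sketch of the deduplicated "PF 256" prefetch removed above.
#include <arm_sve.h>
#include <cstdint>

inline void prefetch_hint_T0(const char *ptr) {
  static std::int64_t last_block = 0;   // last 256-byte block we prefetched
  const std::int64_t block =
      reinterpret_cast<std::intptr_t>(ptr) & 0x7fffffffffffff00ll;
  if (block != last_block) {            // skip repeated prefetches of the same block
    last_block = block;
    svbool_t pg = svptrue_b64();
    svprfd(pg, ptr, SV_PLDL1STRM);      // streaming prefetch into L1
  }
}
```

What remains enabled after this revert is the empty prefetch_HINT_T0 left in place above, i.e. the explicit software prefetch is switched off entirely.
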
| @@ -298,7 +298,7 @@ public: | ||||
|  | ||||
|   // FIXME -- alias this to an accelerator_inline MAC struct. | ||||
|  | ||||
| // specialize mac for A64FX | ||||
|   // FIXME VLA build error | ||||
|   #if defined(A64FX) || defined(A64FXFIXEDSIZE) | ||||
|   friend accelerator_inline void mac(Grid_simd *__restrict__ y, | ||||
| 				     const Grid_simd *__restrict__ a, | ||||
| @@ -894,47 +894,6 @@ accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) { | ||||
|   return in; | ||||
| } | ||||
|  | ||||
| // ----------------------------------------------------------------------------- | ||||
|  | ||||
| // SVE only | ||||
| /////////////////////// | ||||
| // AddTimesI | ||||
| /////////////////////// | ||||
| template <class S, class V, IfComplex<S> = 0> | ||||
| accelerator_inline void addTimesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   ret.v = binary<V>(in1.v, in2.v, AddTimesISIMD()); | ||||
| } | ||||
| template <class S, class V, IfComplex<S> = 0> | ||||
| accelerator_inline Grid_simd<S, V> addTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   Grid_simd<S, V> ret; | ||||
|   ret = addTimesI(in1, in2); | ||||
|   return ret; | ||||
| } | ||||
| template <class S, class V, IfNotComplex<S> = 0> | ||||
| accelerator_inline Grid_simd<S, V> addTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   return in1; | ||||
| } | ||||
| /////////////////////// | ||||
| // SubTimesI | ||||
| /////////////////////// | ||||
| template <class S, class V, IfComplex<S> = 0> | ||||
| accelerator_inline void subTimesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   ret.v = binary<V>(in1.v, in2.v, SubTimesISIMD()); | ||||
| } | ||||
| template <class S, class V, IfComplex<S> = 0> | ||||
| accelerator_inline Grid_simd<S, V> subTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   Grid_simd<S, V> ret; | ||||
|   ret = subTimesI(in1, in2); | ||||
|   return ret; | ||||
| } | ||||
| template <class S, class V, IfNotComplex<S> = 0> | ||||
| accelerator_inline Grid_simd<S, V> subTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) { | ||||
|   return in1; | ||||
| } | ||||
|  | ||||
| // end SVE | ||||
| // ----------------------------------------------------------------------------- | ||||
|  | ||||
| ///////////////////// | ||||
| // Inner, outer | ||||
| ///////////////////// | ||||
|   | ||||
| @@ -68,27 +68,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice | ||||
|   int num=table.size(); | ||||
|   std::pair<int,int> *table_v = & table[0]; | ||||
|   auto rhs_v = rhs.View(); | ||||
|  | ||||
|   // main loop | ||||
|   accelerator_forNB( i,num, vobj::Nsimd(), { | ||||
|     typedef decltype(coalescedRead(buffer[0])) compressed_t; | ||||
|     // prefetching: | ||||
|     // +1% performance for Wilson on 32**4 | ||||
|     // -2% performance for DW on 24**4 x 12 | ||||
|       | ||||
|     const int dist = 7; | ||||
|     if (i+dist < num){ | ||||
|       svbool_t pg1 = svptrue_b64(); | ||||
|  | ||||
|       // prefetch input | ||||
|       auto in = rhs_v(so+table_v[i+dist].second); | ||||
|       svprfd(pg1, (char*)&in, SV_PLDL2STRM); | ||||
|  | ||||
|       // prefetch store buffer | ||||
|       uint64_t o = table_v[i+dist].first; | ||||
|       svprfd(pg1, (char*)&buffer[off+o], SV_PSTL2STRM); | ||||
|     } | ||||
|  | ||||
|     compressed_t   tmp_c; | ||||
|     uint64_t o = table_v[i].first; | ||||
|     compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); | ||||
|   | ||||
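
The block removed from Gather_plane_simple_table above prefetches the gather table's source and destination a fixed distance ahead of the current iteration (dist = 7); the deleted comment reports roughly +1% for Wilson on 32^4 and -2% for domain wall on 24^4 x 12. The general pattern, stripped of Grid's lattice views and the Compress step and written against plain arrays (the element type and table layout here are placeholders, not Grid's):

```cpp
// Prefetch-ahead-by-distance gather, sketched with plain arrays.
#include <arm_sve.h>
#include <cstdint>
#include <utility>
#include <vector>

template <class T>
void gather_with_prefetch(T *buffer, const T *rhs, std::uint64_t so, std::uint64_t off,
                          const std::vector<std::pair<int, int>> &table) {
  const int num  = static_cast<int>(table.size());
  const int dist = 7;                    // how far ahead of i to prefetch
  const svbool_t pg = svptrue_b64();

  for (int i = 0; i < num; i++) {
    if (i + dist < num) {
      // touch the source element and destination slot we will need shortly
      svprfd(pg, (const char *)&rhs[so + table[i + dist].second], SV_PLDL2STRM);
      svprfd(pg, (const char *)&buffer[off + table[i + dist].first], SV_PSTL2STRM);
    }
    buffer[off + table[i].first] = rhs[so + table[i].second];   // the gather itself
  }
}
```
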
| @@ -1,6 +1,6 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/tensors/Tensor_reality.h | ||||
|  | ||||
| @@ -31,16 +31,16 @@ Author: neo <cossu@post.kek.jp> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////  | ||||
| // multiply by I; make recursive. | ||||
| /////////////////////////////////////////////// | ||||
| template<class vtype> accelerator_inline iScalar<vtype> timesI(const iScalar<vtype>&r) | ||||
| ///////////////////////////////////////////////  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> timesI(const iScalar<vtype>&r)  | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
|   timesI(ret._internal,r._internal); | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> timesI(const iVector<vtype,N>&r) | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> timesI(const iVector<vtype,N>&r)  | ||||
| { | ||||
|   iVector<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
| @@ -58,11 +58,11 @@ template<class vtype,int N> accelerator_inline iMatrix<vtype,N> timesI(const iMa | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template<class vtype> accelerator_inline void timesI(iScalar<vtype> &ret,const iScalar<vtype>&r) | ||||
| template<class vtype> accelerator_inline void timesI(iScalar<vtype> &ret,const iScalar<vtype>&r)  | ||||
| { | ||||
|   timesI(ret._internal,r._internal); | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void timesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r) | ||||
| template<class vtype,int N> accelerator_inline void timesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r)  | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     timesI(ret._internal[i],r._internal[i]); | ||||
| @@ -77,13 +77,13 @@ template<class vtype,int N> accelerator_inline void  timesI(iMatrix<vtype,N> &re | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> timesMinusI(const iScalar<vtype>&r) | ||||
| template<class vtype> accelerator_inline iScalar<vtype> timesMinusI(const iScalar<vtype>&r)  | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
|   timesMinusI(ret._internal,r._internal); | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> timesMinusI(const iVector<vtype,N>&r) | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> timesMinusI(const iVector<vtype,N>&r)  | ||||
| { | ||||
|   iVector<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
| @@ -101,11 +101,11 @@ template<class vtype,int N> accelerator_inline iMatrix<vtype,N> timesMinusI(cons | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template<class vtype>  accelerator_inline void timesMinusI(iScalar<vtype> &ret,const iScalar<vtype>&r) | ||||
| template<class vtype>  accelerator_inline void timesMinusI(iScalar<vtype> &ret,const iScalar<vtype>&r)  | ||||
| { | ||||
|   timesMinusI(ret._internal,r._internal); | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void timesMinusI(iVector<vtype,N> &ret,const iVector<vtype,N>&r) | ||||
| template<class vtype,int N> accelerator_inline void timesMinusI(iVector<vtype,N> &ret,const iVector<vtype,N>&r)  | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     timesMinusI(ret._internal[i],r._internal[i]); | ||||
| @@ -120,99 +120,9 @@ template<class vtype,int N> accelerator_inline void  timesMinusI(iMatrix<vtype,N | ||||
| } | ||||
|  | ||||
|  | ||||
| // ----------------------------------------------------------------------------- | ||||
| // SVE | ||||
|  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> addTimesI(const iScalar<vtype>&r1, const iScalar<vtype>&r2) | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
|   addTimesI(ret._internal,r1._internal,r2._internal); | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> addTimesI(const iVector<vtype,N>&r1, const iVector<vtype,N>&r2) | ||||
| { | ||||
|   iVector<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
|     addTimesI(ret._internal[i],r1._internal[i],r2._internal[i]); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iMatrix<vtype,N> addTimesI(const iMatrix<vtype,N>&r1, const iMatrix<vtype,N>&r2) | ||||
| { | ||||
|   iMatrix<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<N;j++){ | ||||
|       addTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]); | ||||
|     }} | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template<class vtype> accelerator_inline void addTimesI(iScalar<vtype> &ret,const iScalar<vtype>&r1,const iScalar<vtype>&r2) | ||||
| { | ||||
|   addTimesI(ret._internal,r1._internal,r2._internal); | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void addTimesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r1,const iVector<vtype,N>&r2) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     addTimesI(ret._internal[i],r1._internal[i],r2._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void addTimesI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r1,const iMatrix<vtype,N>&r2) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<N;j++){ | ||||
|       addTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]); | ||||
|     }} | ||||
| } | ||||
|  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> subTimesI(const iScalar<vtype>&r1, const iScalar<vtype>&r2) | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
|   subTimesI(ret._internal,r1._internal,r2._internal); | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iVector<vtype,N> subTimesI(const iVector<vtype,N>&r1, const iVector<vtype,N>&r2) | ||||
| { | ||||
|   iVector<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
|     subTimesI(ret._internal[i],r1._internal[i],r2._internal[i]); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline iMatrix<vtype,N> subTimesI(const iMatrix<vtype,N>&r1, const iMatrix<vtype,N>&r2) | ||||
| { | ||||
|   iMatrix<vtype,N> ret; | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<N;j++){ | ||||
|       subTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]); | ||||
|     }} | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template<class vtype> accelerator_inline void subTimesI(iScalar<vtype> &ret,const iScalar<vtype>&r1,const iScalar<vtype>&r2) | ||||
| { | ||||
|   subTimesI(ret._internal,r1._internal,r2._internal); | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void subTimesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r1,const iVector<vtype,N>&r2) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     subTimesI(ret._internal[i],r1._internal[i],r2._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class vtype,int N> accelerator_inline void subTimesI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r1,const iMatrix<vtype,N>&r2) | ||||
| { | ||||
|   for(int i=0;i<N;i++){ | ||||
|     for(int j=0;j<N;j++){ | ||||
|       subTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]); | ||||
|     }} | ||||
| } | ||||
| // ----------------------------------------------------------------------------- | ||||
| // end SVE | ||||
|  | ||||
|  | ||||
| /////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////  | ||||
| // Conj function for scalar, vector, matrix | ||||
| /////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> conjugate(const iScalar<vtype>&r) | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
| @@ -237,9 +147,9 @@ template<class vtype,int N> accelerator_inline iMatrix<vtype,N> conjugate(const | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| /////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////  | ||||
| // Adj function for scalar, vector, matrix | ||||
| /////////////////////////////////////////////// | ||||
| ///////////////////////////////////////////////  | ||||
| template<class vtype> accelerator_inline iScalar<vtype> adj(const iScalar<vtype>&r) | ||||
| { | ||||
|   iScalar<vtype> ret; | ||||
| @@ -296,7 +206,7 @@ template<class itype,int N> accelerator_inline auto real(const iVector<itype,N> | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
|      | ||||
| template<class itype> accelerator_inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))> | ||||
| { | ||||
|   iScalar<decltype(imag(z._internal))> ret; | ||||
|   | ||||