mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 20:14:32 +00:00 
			
		
		
		
	Offload more loops
This commit is contained in:
		| @@ -258,15 +258,16 @@ private: | ||||
|   CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO | ||||
|   CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO | ||||
|  | ||||
|  public: | ||||
|   // eventually these can be compressed into 6x6 blocks instead of the 12x12 | ||||
|   // using the DeGrand-Rossi basis for the gamma matrices | ||||
|   CloverFieldType fillCloverYZ(const GaugeLinkField &F) | ||||
|   { | ||||
|     CloverFieldType T(F.Grid()); | ||||
|     T = Zero(); | ||||
|     autoView(T_v,T,CpuWrite); | ||||
|     autoView(F_v,F,CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView(T_v,T,AcceleratorWrite); | ||||
|     autoView(F_v,F,AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); | ||||
|       T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); | ||||
| @@ -282,9 +283,9 @@ private: | ||||
|     CloverFieldType T(F.Grid()); | ||||
|     T = Zero(); | ||||
|      | ||||
|     autoView(T_v, T,CpuWrite); | ||||
|     autoView(F_v, F,CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView(T_v, T,AcceleratorWrite); | ||||
|     autoView(F_v, F,AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 1) = -F_v[i]()(); | ||||
|       T_v[i]()(1, 0) = F_v[i]()(); | ||||
| @@ -300,9 +301,9 @@ private: | ||||
|     CloverFieldType T(F.Grid()); | ||||
|     T = Zero(); | ||||
|  | ||||
|     autoView(T_v,T,CpuWrite); | ||||
|     autoView(F_v,F,CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView(T_v,T,AcceleratorWrite); | ||||
|     autoView(F_v,F,AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); | ||||
|       T_v[i]()(1, 1) = timesI(F_v[i]()()); | ||||
| @@ -318,9 +319,9 @@ private: | ||||
|     CloverFieldType T(F.Grid()); | ||||
|     T = Zero(); | ||||
|  | ||||
|     autoView( T_v , T, CpuWrite); | ||||
|     autoView( F_v , F, CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView( T_v , T, AcceleratorWrite); | ||||
|     autoView( F_v , F, AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 1) = timesI(F_v[i]()()); | ||||
|       T_v[i]()(1, 0) = timesI(F_v[i]()()); | ||||
| @@ -336,9 +337,9 @@ private: | ||||
|     CloverFieldType T(F.Grid()); | ||||
|     T = Zero(); | ||||
|      | ||||
|     autoView( T_v ,T,CpuWrite); | ||||
|     autoView( F_v ,F,CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView( T_v ,T,AcceleratorWrite); | ||||
|     autoView( F_v ,F,AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 1) = -(F_v[i]()()); | ||||
|       T_v[i]()(1, 0) = (F_v[i]()()); | ||||
| @@ -355,9 +356,9 @@ private: | ||||
|  | ||||
|     T = Zero(); | ||||
|  | ||||
|     autoView( T_v , T,CpuWrite); | ||||
|     autoView( F_v , F,CpuRead); | ||||
|     thread_for(i, CloverTerm.Grid()->oSites(), | ||||
|     autoView( T_v , T,AcceleratorWrite); | ||||
|     autoView( F_v , F,AcceleratorRead); | ||||
|     accelerator_for(i, CloverTerm.Grid()->oSites(),1, | ||||
|     { | ||||
|       T_v[i]()(0, 0) = timesI(F_v[i]()()); | ||||
|       T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); | ||||
|   | ||||
| @@ -106,10 +106,10 @@ public: | ||||
| 			    const _SpinorField & phi, | ||||
| 			    int mu) | ||||
|   { | ||||
|     autoView( out_v, out, CpuWrite); | ||||
|     autoView( phi_v, phi, CpuRead); | ||||
|     autoView( Umu_v, Umu, CpuRead); | ||||
|     thread_for(sss,out.Grid()->oSites(),{ | ||||
|     autoView( out_v, out, AcceleratorWrite); | ||||
|     autoView( phi_v, phi, AcceleratorRead); | ||||
|     autoView( Umu_v, Umu, AcceleratorRead); | ||||
|     accelerator_for(sss,out.Grid()->oSites(),1,{ | ||||
| 	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); | ||||
|     }); | ||||
|   } | ||||
| @@ -192,10 +192,10 @@ public: | ||||
|     GaugeLinkField tmp(mat.Grid()); | ||||
|     tmp = Zero(); | ||||
|     { | ||||
|       autoView( tmp_v , tmp, CpuWrite); | ||||
|       autoView( Btilde_v , Btilde, CpuRead); | ||||
|       autoView( Atilde_v , Atilde, CpuRead); | ||||
|       thread_for(sss,tmp.Grid()->oSites(),{ | ||||
|       autoView( tmp_v , tmp, AcceleratorWrite); | ||||
|       autoView( Btilde_v , Btilde, AcceleratorRead); | ||||
|       autoView( Atilde_v , Atilde, AcceleratorRead); | ||||
|       accelerator_for(sss,tmp.Grid()->oSites(),1,{ | ||||
| 	  int sU=sss; | ||||
| 	  for(int s=0;s<Ls;s++){ | ||||
| 	    int sF = s+Ls*sU; | ||||
|   | ||||
| @@ -467,32 +467,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1, | ||||
|   conformable(_grid, q_in_1.Grid()); | ||||
|   conformable(_grid, q_in_2.Grid()); | ||||
|   conformable(_grid, q_out.Grid()); | ||||
| #if 0 | ||||
|   PropagatorField tmp1(_grid), tmp2(_grid); | ||||
|   q_out = Zero(); | ||||
|  | ||||
|   // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu). | ||||
|   // Inefficient comms method but not performance critical. | ||||
|   tmp1 = Cshift(q_in_1, mu, 1); | ||||
|   tmp2 = Cshift(q_in_2, mu, 1); | ||||
|   autoView( tmp1_v  ,  tmp1, CpuWrite); | ||||
|   autoView( tmp2_v  ,  tmp2, CpuWrite); | ||||
|   autoView( q_in_1_v,q_in_1, CpuRead); | ||||
|   autoView( q_in_2_v,q_in_2, CpuRead); | ||||
|   autoView( q_out_v , q_out, CpuRead); | ||||
|   autoView( Umu_v   ,   Umu, CpuRead); | ||||
|   thread_for(sU, Umu.Grid()->oSites(),{ | ||||
|       Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], | ||||
| 					       q_in_2_v[sU], | ||||
| 					       q_out_v[sU], | ||||
| 					       Umu_v, sU, mu); | ||||
|       Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU], | ||||
| 					       tmp2_v[sU], | ||||
| 					       q_out_v[sU], | ||||
| 					       Umu_v, sU, mu); | ||||
|   }); | ||||
| #else | ||||
| #endif | ||||
|   assert(0); | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -508,62 +483,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, | ||||
| { | ||||
|   conformable(_grid, q_in.Grid()); | ||||
|   conformable(_grid, q_out.Grid()); | ||||
| #if 0 | ||||
|  | ||||
|   //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid); | ||||
|   Complex i(0.0,1.0); | ||||
|   PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid); | ||||
|   unsigned int tshift = (mu == Tp) ? 1 : 0; | ||||
|   unsigned int LLt    = GridDefaultLatt()[Tp]; | ||||
|  | ||||
|   q_out = Zero(); | ||||
|   LatticeInteger coords(_grid); | ||||
|   LatticeCoordinate(coords, Tp); | ||||
|  | ||||
|   // Need q(x + mu) and q(x - mu). | ||||
|   tmp    = Cshift(q_in, mu, 1); | ||||
|   tmpFwd = tmp*lattice_cmplx; | ||||
|   tmp    = lattice_cmplx*q_in; | ||||
|   tmpBwd = Cshift(tmp, mu, -1); | ||||
|  | ||||
|   autoView( coords_v , coords, CpuRead); | ||||
|   autoView( tmpFwd_v , tmpFwd, CpuRead); | ||||
|   autoView( tmpBwd_v , tmpBwd, CpuRead); | ||||
|   autoView( Umu_v    , Umu, CpuRead); | ||||
|   autoView( q_out_v  , q_out, CpuWrite); | ||||
|  | ||||
|   thread_for(sU, Umu.Grid()->oSites(), { | ||||
|  | ||||
|     // Compute the sequential conserved current insertion only if our simd | ||||
|     // object contains a timeslice we need. | ||||
|     vPredicate t_mask; | ||||
|     t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax)); | ||||
|     Integer timeSlices = Reduce(t_mask()); | ||||
|  | ||||
|     if (timeSlices > 0) { | ||||
|       Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],  | ||||
| 					  q_out_v[sU],  | ||||
| 					  Umu_v, sU, mu, t_mask); | ||||
|     } | ||||
|  | ||||
|     // Repeat for backward direction. | ||||
|     t_mask()     = ((coords_v[sU] >= (tmin + tshift)) &&  | ||||
| 		    (coords_v[sU] <= (tmax + tshift))); | ||||
|      | ||||
|     //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	 | ||||
|     unsigned int t0 = 0; | ||||
|     if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); | ||||
|      | ||||
|     timeSlices = Reduce(t_mask()); | ||||
|  | ||||
|     if (timeSlices > 0) { | ||||
|       Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],  | ||||
| 					  q_out_v[sU],  | ||||
| 					  Umu_v, sU, mu, t_mask); | ||||
|     } | ||||
|   }); | ||||
| #else | ||||
| #endif | ||||
|   assert(0); | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user