|  |  |  | @@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | namespace Grid { | 
		
	
		
			
				|  |  |  |  | namespace QCD { | 
		
	
		
			
				|  |  |  |  |   /* | 
		
	
		
			
				|  |  |  |  | namespace QCD {  /* | 
		
	
		
			
				|  |  |  |  |    * Dense matrix versions of routines | 
		
	
		
			
				|  |  |  |  |    */ | 
		
	
		
			
				|  |  |  |  | template<class Impl> | 
		
	
	
		
			
				
					
					|  |  |  | @@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  |       for(int v=0;v<LLs;v++){ | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	vprefetch(psi[ss+v+LLs]); | 
		
	
		
			
				|  |  |  |  | 	//	vprefetch(phi[ss+v+LLs]); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	int vp= (v==LLs-1) ? 0     : v+1; | 
		
	
		
			
				|  |  |  |  | 	int vm= (v==0    ) ? LLs-1 : v-1; | 
		
	
	
		
			
				
					
					|  |  |  | @@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  | 	Simd hm_11 = psi[ss+vm]()(1)(1);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_12 = psi[ss+vm]()(1)(2);  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	if ( vp<=v ) { | 
		
	
		
			
				|  |  |  |  | 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); | 
		
	
	
		
			
				
					
					|  |  |  | @@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  | 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	/* | 
		
	
		
			
				|  |  |  |  | 	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl; | 
		
	
		
			
				|  |  |  |  | 	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl; | 
		
	
		
			
				|  |  |  |  | 	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl; | 
		
	
		
			
				|  |  |  |  | 	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl; | 
		
	
		
			
				|  |  |  |  | 	*/	 | 
		
	
		
			
				|  |  |  |  | 	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;   | 
		
	
		
			
				|  |  |  |  | 	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11;  | 
		
	
		
			
				|  |  |  |  | 	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12;  | 
		
	
		
			
				|  |  |  |  | 	// Can force these to real arithmetic and save 2x. | 
		
	
		
			
				|  |  |  |  | 	Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(l[v]()()(),hm_00);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(l[v]()()(),hm_01);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(l[v]()()(),hm_02);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(l[v]()()(),hm_10);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(l[v]()()(),hm_11);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(l[v]()()(),hm_12);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(u[v]()()(),hp_00);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(u[v]()()(),hp_01);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(u[v]()()(),hp_02);   | 
		
	
		
			
				|  |  |  |  | 	Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(u[v]()()(),hp_10);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(u[v]()()(),hp_11);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(u[v]()()(),hp_12);  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	 | 
		
	
		
			
				|  |  |  |  | 	//	if ( ss==0){ | 
		
	
		
			
				|  |  |  |  | 	/* | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl; | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  | 	*/ | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(0),p_00); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(1),p_01); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(2),p_02); | 
		
	
	
		
			
				
					
					|  |  |  | @@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, | 
		
	
		
			
				|  |  |  |  |   M5Dtime-=usecond(); | 
		
	
		
			
				|  |  |  |  | PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  |   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #if 0 | 
		
	
		
			
				|  |  |  |  |     alignas(64) SiteHalfSpinor hp; | 
		
	
		
			
				|  |  |  |  |     alignas(64) SiteHalfSpinor hm; | 
		
	
		
			
				|  |  |  |  |     alignas(64) SiteSpinor fp; | 
		
	
	
		
			
				
					
					|  |  |  | @@ -287,9 +260,231 @@ PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  |       chi[ss+v] = chi[ss+v]     +l[v]*fm; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  | #else | 
		
	
		
			
				|  |  |  |  |       for(int v=0;v<LLs;v++){ | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	vprefetch(psi[ss+v+LLs]); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	int vp= (v==LLs-1) ? 0     : v+1; | 
		
	
		
			
				|  |  |  |  | 	int vm= (v==0    ) ? LLs-1 : v-1; | 
		
	
		
			
				|  |  |  |  | 	 | 
		
	
		
			
				|  |  |  |  | 	Simd hp_00 = psi[ss+vp]()(0)(0);  | 
		
	
		
			
				|  |  |  |  | 	Simd hp_01 = psi[ss+vp]()(0)(1);  | 
		
	
		
			
				|  |  |  |  | 	Simd hp_02 = psi[ss+vp]()(0)(2);  | 
		
	
		
			
				|  |  |  |  | 	Simd hp_10 = psi[ss+vp]()(1)(0);  | 
		
	
		
			
				|  |  |  |  | 	Simd hp_11 = psi[ss+vp]()(1)(1);  | 
		
	
		
			
				|  |  |  |  | 	Simd hp_12 = psi[ss+vp]()(1)(2);  | 
		
	
		
			
				|  |  |  |  | 	 | 
		
	
		
			
				|  |  |  |  | 	Simd hm_00 = psi[ss+vm]()(2)(0);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_01 = psi[ss+vm]()(2)(1);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_02 = psi[ss+vm]()(2)(2);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_10 = psi[ss+vm]()(3)(0);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_11 = psi[ss+vm]()(3)(1);  | 
		
	
		
			
				|  |  |  |  | 	Simd hm_12 = psi[ss+vm]()(3)(2);  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	if ( vp<=v ) { | 
		
	
		
			
				|  |  |  |  | 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); | 
		
	
		
			
				|  |  |  |  | 	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  | 	if ( vm>=v ) { | 
		
	
		
			
				|  |  |  |  | 	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); | 
		
	
		
			
				|  |  |  |  | 	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); | 
		
	
		
			
				|  |  |  |  | 	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); | 
		
	
		
			
				|  |  |  |  | 	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); | 
		
	
		
			
				|  |  |  |  | 	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); | 
		
	
		
			
				|  |  |  |  | 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(u[v]()()(),hp_00);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(u[v]()()(),hp_01);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(u[v]()()(),hp_02);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(u[v]()()(),hp_10);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(u[v]()()(),hp_11);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(u[v]()()(),hp_12);  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(l[v]()()(),hm_00);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(l[v]()()(),hm_01);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(l[v]()()(),hm_02);   | 
		
	
		
			
				|  |  |  |  | 	Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(l[v]()()(),hm_10);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(l[v]()()(),hm_11);  | 
		
	
		
			
				|  |  |  |  | 	Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(l[v]()()(),hm_12);  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(0),p_00); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(1),p_01); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(0)(2),p_02); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(1)(0),p_10); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(1)(1),p_11); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(1)(2),p_12); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(2)(0),p_20); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(2)(1),p_21); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(2)(2),p_22); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(3)(0),p_30); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(3)(1),p_31); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[ss+v]()(3)(2),p_32); | 
		
	
		
			
				|  |  |  |  |       } | 
		
	
		
			
				|  |  |  |  | #endif | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  |   M5Dtime+=usecond(); | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #include <simd/Intel512common.h> | 
		
	
		
			
				|  |  |  |  | #include <simd/Intel512avx.h> | 
		
	
		
			
				|  |  |  |  | #include <simd/Intel512single.h> | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | template<class Impl> | 
		
	
		
			
				|  |  |  |  | void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi, | 
		
	
		
			
				|  |  |  |  | 					     int LLs, int site, | 
		
	
		
			
				|  |  |  |  | 					     Vector<iSinglet<Simd> > &Matp, | 
		
	
		
			
				|  |  |  |  | 					     Vector<iSinglet<Simd> > &Matm) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  | #if 0 | 
		
	
		
			
				|  |  |  |  |   { | 
		
	
		
			
				|  |  |  |  |   SiteHalfSpinor BcastP; | 
		
	
		
			
				|  |  |  |  |   SiteHalfSpinor BcastM; | 
		
	
		
			
				|  |  |  |  |   SiteHalfSpinor SiteChiP; | 
		
	
		
			
				|  |  |  |  |   SiteHalfSpinor SiteChiM; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |   // Ls*Ls * 2 * 12 * vol flops | 
		
	
		
			
				|  |  |  |  |   for(int s1=0;s1<LLs;s1++){  | 
		
	
		
			
				|  |  |  |  |     for(int s2=0;s2<LLs;s2++){  | 
		
	
		
			
				|  |  |  |  |       for(int  l=0; l<Simd::Nsimd();l++){ // simd lane | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |         int s=s2+l*LLs; | 
		
	
		
			
				|  |  |  |  | 	int lex=s2+LLs*site; | 
		
	
		
			
				|  |  |  |  | 	 | 
		
	
		
			
				|  |  |  |  | 	if ( s2==0 && l==0) { | 
		
	
		
			
				|  |  |  |  | 	  SiteChiP=zero; | 
		
	
		
			
				|  |  |  |  | 	  SiteChiM=zero; | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  | 	 | 
		
	
		
			
				|  |  |  |  | 	for(int sp=0;sp<2;sp++){ | 
		
	
		
			
				|  |  |  |  |         for(int co=0;co<Nc;co++){ | 
		
	
		
			
				|  |  |  |  | 	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l); | 
		
	
		
			
				|  |  |  |  | 	}} | 
		
	
		
			
				|  |  |  |  | 	for(int sp=0;sp<2;sp++){ | 
		
	
		
			
				|  |  |  |  |         for(int co=0;co<Nc;co++){ | 
		
	
		
			
				|  |  |  |  | 	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l); | 
		
	
		
			
				|  |  |  |  | 	}} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	for(int sp=0;sp<2;sp++){ | 
		
	
		
			
				|  |  |  |  |         for(int co=0;co<Nc;co++){ | 
		
	
		
			
				|  |  |  |  | 	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us. | 
		
	
		
			
				|  |  |  |  | 	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out | 
		
	
		
			
				|  |  |  |  | 	}} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     }} | 
		
	
		
			
				|  |  |  |  |     { | 
		
	
		
			
				|  |  |  |  |       int lex = s1+LLs*site; | 
		
	
		
			
				|  |  |  |  |       for(int sp=0;sp<2;sp++){ | 
		
	
		
			
				|  |  |  |  |       for(int co=0;co<Nc;co++){ | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co)); | 
		
	
		
			
				|  |  |  |  | 	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co)); | 
		
	
		
			
				|  |  |  |  |       }} | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  | #else | 
		
	
		
			
				|  |  |  |  |   { | 
		
	
		
			
				|  |  |  |  |   // pointers | 
		
	
		
			
				|  |  |  |  |     //  MASK_REGS; | 
		
	
		
			
				|  |  |  |  | #define Chi_00 %%zmm1 | 
		
	
		
			
				|  |  |  |  | #define Chi_01 %%zmm2 | 
		
	
		
			
				|  |  |  |  | #define Chi_02 %%zmm3 | 
		
	
		
			
				|  |  |  |  | #define Chi_10 %%zmm4 | 
		
	
		
			
				|  |  |  |  | #define Chi_11 %%zmm5 | 
		
	
		
			
				|  |  |  |  | #define Chi_12 %%zmm6 | 
		
	
		
			
				|  |  |  |  | #define Chi_20 %%zmm7 | 
		
	
		
			
				|  |  |  |  | #define Chi_21 %%zmm8 | 
		
	
		
			
				|  |  |  |  | #define Chi_22 %%zmm9 | 
		
	
		
			
				|  |  |  |  | #define Chi_30 %%zmm10 | 
		
	
		
			
				|  |  |  |  | #define Chi_31 %%zmm11 | 
		
	
		
			
				|  |  |  |  | #define Chi_32 %%zmm12 | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #define BCAST0   %%zmm13 | 
		
	
		
			
				|  |  |  |  | #define BCAST1   %%zmm14 | 
		
	
		
			
				|  |  |  |  | #define BCAST2   %%zmm15 | 
		
	
		
			
				|  |  |  |  | #define BCAST3   %%zmm16 | 
		
	
		
			
				|  |  |  |  | #define BCAST4   %%zmm17 | 
		
	
		
			
				|  |  |  |  | #define BCAST5   %%zmm18 | 
		
	
		
			
				|  |  |  |  | #define BCAST6   %%zmm19 | 
		
	
		
			
				|  |  |  |  | #define BCAST7   %%zmm20 | 
		
	
		
			
				|  |  |  |  | #define BCAST8   %%zmm21 | 
		
	
		
			
				|  |  |  |  | #define BCAST9   %%zmm22 | 
		
	
		
			
				|  |  |  |  | #define BCAST10  %%zmm23 | 
		
	
		
			
				|  |  |  |  | #define BCAST11  %%zmm24 | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |   int incr=LLs*LLs*sizeof(iSinglet<Simd>); | 
		
	
		
			
				|  |  |  |  |   for(int s1=0;s1<LLs;s1++){  | 
		
	
		
			
				|  |  |  |  |     for(int s2=0;s2<LLs;s2++){  | 
		
	
		
			
				|  |  |  |  |       int lex=s2+LLs*site; | 
		
	
		
			
				|  |  |  |  |       uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable | 
		
	
		
			
				|  |  |  |  |       uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1]; | 
		
	
		
			
				|  |  |  |  |       uint64_t a2 = (uint64_t)&psi[lex]; | 
		
	
		
			
				|  |  |  |  |       for(int  l=0; l<Simd::Nsimd();l++){ // simd lane | 
		
	
		
			
				|  |  |  |  | 	if ( (s2+l)==0 ) { | 
		
	
		
			
				|  |  |  |  | 	  asm ( | 
		
	
		
			
				|  |  |  |  |   	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1) | 
		
	
		
			
				|  |  |  |  |   	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2) | 
		
	
		
			
				|  |  |  |  |   	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)          | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(0,%2,BCAST0)    | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(1,%2,BCAST1)    | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(2,%2,BCAST2)    | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(3,%2,BCAST3)    | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21) | 
		
	
		
			
				|  |  |  |  | 		   VMULMEM (0,%1,BCAST8,Chi_22)          | 
		
	
		
			
				|  |  |  |  | 		   VMULMEM (0,%1,BCAST9,Chi_30) | 
		
	
		
			
				|  |  |  |  | 		   VMULMEM (0,%1,BCAST10,Chi_31)        | 
		
	
		
			
				|  |  |  |  | 		   VMULMEM (0,%1,BCAST11,Chi_32) | 
		
	
		
			
				|  |  |  |  | 		   : : "r" (a0), "r" (a1), "r" (a2)  ); | 
		
	
		
			
				|  |  |  |  | 	} else {  | 
		
	
		
			
				|  |  |  |  | 	  asm ( | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31) | 
		
	
		
			
				|  |  |  |  | 		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32)  | 
		
	
		
			
				|  |  |  |  | 		   : : "r" (a0), "r" (a1), "r" (a2)  ); | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  | 	a0 = a0+incr; | 
		
	
		
			
				|  |  |  |  | 	a1 = a1+incr; | 
		
	
		
			
				|  |  |  |  | 	a2 = a2+sizeof(Simd::scalar_type); | 
		
	
		
			
				|  |  |  |  |       }} | 
		
	
		
			
				|  |  |  |  |     { | 
		
	
		
			
				|  |  |  |  |       int lexa = s1+LLs*site; | 
		
	
		
			
				|  |  |  |  |       asm ( | 
		
	
		
			
				|  |  |  |  | 	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		 | 
		
	
		
			
				|  |  |  |  | 	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		 | 
		
	
		
			
				|  |  |  |  | 	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		 | 
		
	
		
			
				|  |  |  |  | 	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		 | 
		
	
		
			
				|  |  |  |  | 	       : : "r" ((uint64_t)&chi[lexa]) : "memory" ); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  | #endif | 
		
	
		
			
				|  |  |  |  | }; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | template<class Impl> | 
		
	
		
			
				|  |  |  |  | void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
	
		
			
				
					
					|  |  |  | @@ -342,37 +537,38 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField | 
		
	
		
			
				|  |  |  |  |   for(int s1=0;s1<LLs;s1++){ | 
		
	
		
			
				|  |  |  |  |     int istride = LLs; | 
		
	
		
			
				|  |  |  |  |     int ostride = 1; | 
		
	
		
			
				|  |  |  |  |       Simd Vp; | 
		
	
		
			
				|  |  |  |  |       Simd Vm; | 
		
	
		
			
				|  |  |  |  |       scalar_type *sp = (scalar_type *)&Vp; | 
		
	
		
			
				|  |  |  |  |       scalar_type *sm = (scalar_type *)&Vm; | 
		
	
		
			
				|  |  |  |  |       for(int l=0;l<Nsimd;l++){ | 
		
	
		
			
				|  |  |  |  | 	sp[l] = PplusMat (l*istride+s1*ostride ,s2); | 
		
	
		
			
				|  |  |  |  | 	sm[l] = PminusMat(l*istride+s1*ostride,s2); | 
		
	
		
			
				|  |  |  |  |       } | 
		
	
		
			
				|  |  |  |  |       Matp[LLs*s2+s1] = Vp; | 
		
	
		
			
				|  |  |  |  |       Matm[LLs*s2+s1] = Vm; | 
		
	
		
			
				|  |  |  |  |     Simd Vp; | 
		
	
		
			
				|  |  |  |  |     Simd Vm; | 
		
	
		
			
				|  |  |  |  |     scalar_type *sp = (scalar_type *)&Vp; | 
		
	
		
			
				|  |  |  |  |     scalar_type *sm = (scalar_type *)&Vm; | 
		
	
		
			
				|  |  |  |  |     for(int l=0;l<Nsimd;l++){ | 
		
	
		
			
				|  |  |  |  |       sp[l] = PplusMat (l*istride+s1*ostride,s2); | 
		
	
		
			
				|  |  |  |  |       sp[l] = scalar_type(sp[l].real(),sp[l].real()); | 
		
	
		
			
				|  |  |  |  |       sm[l] = PminusMat(l*istride+s1*ostride,s2); | 
		
	
		
			
				|  |  |  |  |       sm[l] = scalar_type(sm[l].real(),sm[l].real()); | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  |     Matp[LLs*s2+s1] = Vp; | 
		
	
		
			
				|  |  |  |  |     Matm[LLs*s2+s1] = Vm; | 
		
	
		
			
				|  |  |  |  |   }} | 
		
	
		
			
				|  |  |  |  |    | 
		
	
		
			
				|  |  |  |  |    | 
		
	
		
			
				|  |  |  |  |   MooeeInvCalls++; | 
		
	
		
			
				|  |  |  |  |   MooeeInvTime-=usecond(); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |   // Dynamic allocate on stack to get per thread without serialised heap acces | 
		
	
		
			
				|  |  |  |  | #if 0 | 
		
	
		
			
				|  |  |  |  | #pragma omp parallel   | 
		
	
		
			
				|  |  |  |  |   { | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     Vector<SiteHalfSpinor> SitePplus(LLs); | 
		
	
		
			
				|  |  |  |  |     Vector<SiteHalfSpinor> SitePminus(LLs); | 
		
	
		
			
				|  |  |  |  |     Vector<SiteHalfSpinor> SiteChiP(LLs); | 
		
	
		
			
				|  |  |  |  |     Vector<SiteHalfSpinor> SiteChiM(LLs); | 
		
	
		
			
				|  |  |  |  |     Vector<SiteSpinor>     SiteChi(LLs); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     SiteHalfSpinor BcastP; | 
		
	
		
			
				|  |  |  |  |     SiteHalfSpinor BcastM; | 
		
	
		
			
				|  |  |  |  |     std::vector<SiteHalfSpinor> SitePplus(LLs); | 
		
	
		
			
				|  |  |  |  |     std::vector<SiteHalfSpinor> SitePminus(LLs); | 
		
	
		
			
				|  |  |  |  |     std::vector<SiteHalfSpinor> SiteChiP(LLs); | 
		
	
		
			
				|  |  |  |  |     std::vector<SiteHalfSpinor> SiteChiM(LLs); | 
		
	
		
			
				|  |  |  |  |     std::vector<SiteSpinor>     SiteChi(LLs); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #pragma omp for  | 
		
	
		
			
				|  |  |  |  |   for(auto site=0;site<vol;site++){ | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     SiteHalfSpinor BcastP; | 
		
	
		
			
				|  |  |  |  |     SiteHalfSpinor BcastM; | 
		
	
		
			
				|  |  |  |  |     for(int s=0;s<LLs;s++){ | 
		
	
		
			
				|  |  |  |  |       int lex = s+LLs*site; | 
		
	
		
			
				|  |  |  |  |       spProj5p(SitePplus[s] ,psi[lex]); | 
		
	
	
		
			
				
					
					|  |  |  | @@ -390,8 +586,9 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField | 
		
	
		
			
				|  |  |  |  | 	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP; | 
		
	
		
			
				|  |  |  |  | 	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM; | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  |       s++; | 
		
	
		
			
				|  |  |  |  |     }} | 
		
	
		
			
				|  |  |  |  | 	s++; | 
		
	
		
			
				|  |  |  |  |       } | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     for(int s=0;s<LLs;s++){ | 
		
	
		
			
				|  |  |  |  |       int lex = s+LLs*site; | 
		
	
	
		
			
				
					
					|  |  |  | @@ -399,8 +596,16 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField | 
		
	
		
			
				|  |  |  |  |       accumRecon5m(SiteChi[s],SiteChiM[s]); | 
		
	
		
			
				|  |  |  |  |       chi[lex] = SiteChi[s]*0.5; | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |   }} | 
		
	
		
			
				|  |  |  |  | #else     | 
		
	
		
			
				|  |  |  |  |   PARALLEL_FOR_LOOP | 
		
	
		
			
				|  |  |  |  |   for(auto site=0;site<vol;site++){ | 
		
	
		
			
				|  |  |  |  |     MooeeInternalAsm(psi,chi, | 
		
	
		
			
				|  |  |  |  | 		     LLs,site, | 
		
	
		
			
				|  |  |  |  | 		     Matp,Matm); | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  |   } | 
		
	
		
			
				|  |  |  |  | #endif | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |   MooeeInvTime+=usecond(); | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
	
		
			
				
					
					|  |  |  |   |