| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				namespace Grid {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				namespace QCD {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  /*
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				namespace QCD {  /*
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				   * Dense matrix versions of routines
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				   */
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				template<class Impl>
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int v=0;v<LLs;v++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vprefetch(psi[ss+v+LLs]);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					//	vprefetch(phi[ss+v+LLs]);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					int vp= (v==LLs-1) ? 0     : v+1;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					int vm= (v==0    ) ? LLs-1 : v-1;
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_11 = psi[ss+vm]()(1)(1); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_12 = psi[ss+vm]()(1)(2); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( vp<=v ) {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					/*
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					*/	
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					// Can force these to real arithmetic and save 2x.
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(l[v]()()(),hm_00); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(l[v]()()(),hm_01); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(l[v]()()(),hm_02); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(l[v]()()(),hm_10); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(l[v]()()(),hm_11); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(l[v]()()(),hm_12); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(u[v]()()(),hp_00); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(u[v]()()(),hp_01); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(u[v]()()(),hp_02);  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(u[v]()()(),hp_10); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(u[v]()()(),hp_11); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(u[v]()()(),hp_12); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					//	if ( ss==0){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					/*
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					*/
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(0),p_00);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(1),p_01);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(2),p_02);
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  M5Dtime-=usecond();
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#if 0
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    alignas(64) SiteHalfSpinor hp;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    alignas(64) SiteHalfSpinor hm;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    alignas(64) SiteSpinor fp;
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -287,9 +260,231 @@ PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      chi[ss+v] = chi[ss+v]     +l[v]*fm;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#else
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int v=0;v<LLs;v++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vprefetch(psi[ss+v+LLs]);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					int vp= (v==LLs-1) ? 0     : v+1;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					int vm= (v==0    ) ? LLs-1 : v-1;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_00 = psi[ss+vp]()(0)(0); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_01 = psi[ss+vp]()(0)(1); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_02 = psi[ss+vp]()(0)(2); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_10 = psi[ss+vp]()(1)(0); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_11 = psi[ss+vp]()(1)(1); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hp_12 = psi[ss+vp]()(1)(2); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_00 = psi[ss+vm]()(2)(0); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_01 = psi[ss+vm]()(2)(1); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_02 = psi[ss+vm]()(2)(2); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_10 = psi[ss+vm]()(3)(0); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_11 = psi[ss+vm]()(3)(1); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd hm_12 = psi[ss+vm]()(3)(2); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( vp<=v ) {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( vm>=v ) {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_00  = real_mult(d[v]()()(), phi[ss+v]()(0)(0))  + real_mult(u[v]()()(),hp_00); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_01  = real_mult(d[v]()()(), phi[ss+v]()(0)(1))  + real_mult(u[v]()()(),hp_01); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_02  = real_mult(d[v]()()(), phi[ss+v]()(0)(2))  + real_mult(u[v]()()(),hp_02); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_10  = real_mult(d[v]()()(), phi[ss+v]()(1)(0))  + real_mult(u[v]()()(),hp_10); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_11  = real_mult(d[v]()()(), phi[ss+v]()(1)(1))  + real_mult(u[v]()()(),hp_11); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_12  = real_mult(d[v]()()(), phi[ss+v]()(1)(2))  + real_mult(u[v]()()(),hp_12); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_20  = real_mult(d[v]()()(), phi[ss+v]()(2)(0))  + real_mult(l[v]()()(),hm_00); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_21  = real_mult(d[v]()()(), phi[ss+v]()(2)(1))  + real_mult(l[v]()()(),hm_01); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_22  = real_mult(d[v]()()(), phi[ss+v]()(2)(2))  + real_mult(l[v]()()(),hm_02);  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_30  = real_mult(d[v]()()(), phi[ss+v]()(3)(0))  + real_mult(l[v]()()(),hm_10); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_31  = real_mult(d[v]()()(), phi[ss+v]()(3)(1))  + real_mult(l[v]()()(),hm_11); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					Simd p_32  = real_mult(d[v]()()(), phi[ss+v]()(3)(2))  + real_mult(l[v]()()(),hm_12); 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(0),p_00);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(1),p_01);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(0)(2),p_02);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(1)(0),p_10);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(1)(1),p_11);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(1)(2),p_12);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(2)(0),p_20);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(2)(1),p_21);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(2)(2),p_22);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(3)(0),p_30);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(3)(1),p_31);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[ss+v]()(3)(2),p_32);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#endif
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  M5Dtime+=usecond();
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#include <simd/Intel512common.h>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#include <simd/Intel512avx.h>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#include <simd/Intel512single.h>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				template<class Impl>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
									     int LLs, int site,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
									     Vector<iSinglet<Simd> > &Matp,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
									     Vector<iSinglet<Simd> > &Matm)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				{
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#if 0
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  SiteHalfSpinor BcastP;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  SiteHalfSpinor BcastM;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  SiteHalfSpinor SiteChiP;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  SiteHalfSpinor SiteChiM;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  // Ls*Ls * 2 * 12 * vol flops
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  for(int s1=0;s1<LLs;s1++){ 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    for(int s2=0;s2<LLs;s2++){ 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        int s=s2+l*LLs;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					int lex=s2+LLs*site;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( s2==0 && l==0) {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  SiteChiP=zero;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  SiteChiM=zero;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					for(int sp=0;sp<2;sp++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        for(int co=0;co<Nc;co++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					for(int sp=0;sp<2;sp++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        for(int co=0;co<Nc;co++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					for(int sp=0;sp<2;sp++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        for(int co=0;co<Nc;co++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      int lex = s1+LLs*site;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int sp=0;sp<2;sp++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int co=0;co<Nc;co++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#else
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  // pointers
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    //  MASK_REGS;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_00 %%zmm1
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_01 %%zmm2
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_02 %%zmm3
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_10 %%zmm4
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_11 %%zmm5
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_12 %%zmm6
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_20 %%zmm7
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_21 %%zmm8
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_22 %%zmm9
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_30 %%zmm10
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_31 %%zmm11
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define Chi_32 %%zmm12
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST0   %%zmm13
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST1   %%zmm14
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST2   %%zmm15
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST3   %%zmm16
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST4   %%zmm17
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST5   %%zmm18
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST6   %%zmm19
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST7   %%zmm20
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST8   %%zmm21
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST9   %%zmm22
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST10  %%zmm23
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#define BCAST11  %%zmm24
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  for(int s1=0;s1<LLs;s1++){ 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    for(int s2=0;s2<LLs;s2++){ 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      int lex=s2+LLs*site;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      uint64_t a2 = (uint64_t)&psi[lex];
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					if ( (s2+l)==0 ) {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  asm (
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(0,%2,BCAST0)   
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(1,%2,BCAST1)   
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(2,%2,BCAST2)   
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(3,%2,BCAST3)   
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VMULMEM (0,%1,BCAST8,Chi_22)         
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VMULMEM (0,%1,BCAST9,Chi_30)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VMULMEM (0,%1,BCAST10,Chi_31)       
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VMULMEM (0,%1,BCAST11,Chi_32)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   : : "r" (a0), "r" (a1), "r" (a2)  );
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					} else { 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  asm (
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						   : : "r" (a0), "r" (a1), "r" (a2)  );
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					a0 = a0+incr;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					a1 = a1+incr;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					a2 = a2+sizeof(Simd::scalar_type);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      int lexa = s1+LLs*site;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      asm (
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#endif
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				};
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				template<class Impl>
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				{
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -347,32 +542,33 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    scalar_type *sp = (scalar_type *)&Vp;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    scalar_type *sm = (scalar_type *)&Vm;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    for(int l=0;l<Nsimd;l++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					sp[l] = PplusMat (l*istride+s1*ostride ,s2);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      sp[l] = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      sp[l] = scalar_type(sp[l].real(),sp[l].real());
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      sm[l] = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      sm[l] = scalar_type(sm[l].real(),sm[l].real());
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Matp[LLs*s2+s1] = Vp;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Matm[LLs*s2+s1] = Vm;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  MooeeInvCalls++;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  MooeeInvTime-=usecond();
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  // Dynamically allocate on the stack to get per-thread storage without serialised heap access
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#if 0
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#pragma omp parallel  
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  {
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Vector<SiteHalfSpinor> SitePplus(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Vector<SiteHalfSpinor> SitePminus(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Vector<SiteHalfSpinor> SiteChiP(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Vector<SiteHalfSpinor> SiteChiM(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    Vector<SiteSpinor>     SiteChi(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    SiteHalfSpinor BcastP;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    SiteHalfSpinor BcastM;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    std::vector<SiteHalfSpinor> SitePplus(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    std::vector<SiteHalfSpinor> SitePminus(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    std::vector<SiteHalfSpinor> SiteChiP(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    std::vector<SiteHalfSpinor> SiteChiM(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    std::vector<SiteSpinor>     SiteChi(LLs);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#pragma omp for 
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  for(auto site=0;site<vol;site++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    SiteHalfSpinor BcastP;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    SiteHalfSpinor BcastM;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    for(int s=0;s<LLs;s++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      int lex = s+LLs*site;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      spProj5p(SitePplus[s] ,psi[lex]);
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -391,7 +587,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
					s++;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    for(int s=0;s<LLs;s++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      int lex = s+LLs*site;
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -399,8 +596,16 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      accumRecon5m(SiteChi[s],SiteChiM[s]);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				      chi[lex] = SiteChi[s]*0.5;
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#else    
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  PARALLEL_FOR_LOOP
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  for(auto site=0;site<vol;site++){
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    MooeeInternalAsm(psi,chi,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						     LLs,site,
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
						     Matp,Matm);
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  }
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				#endif
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				  MooeeInvTime+=usecond();
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				}
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				 
 |