mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Compare commits
	
		
			14 Commits
		
	
	
		
			feature/ca
			...
			feature/bl
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					b2493d6d25 | ||
| 
						 | 
					8fd16686dc | ||
| 
						 | 
					23b9c6b5f5 | ||
| 
						 | 
					43943008bf | ||
| 
						 | 
					8ee26f9112 | ||
| 
						 | 
					f91e3af97f | ||
| 
						 | 
					43298ef681 | ||
| 
						 | 
					7e70df27e4 | ||
| 
						 | 
					c55d657736 | ||
| 
						 | 
					b89b1280d5 | ||
| 
						 | 
					ac7090e6d3 | ||
| 
						 | 
					02edbe624f | ||
| 
						 | 
					9266b89ad8 | ||
| 
						 | 
					2db7e6f8ab | 
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -10,6 +10,8 @@
 | 
				
			|||||||
*~
 | 
					*~
 | 
				
			||||||
*#
 | 
					*#
 | 
				
			||||||
*.sublime-*
 | 
					*.sublime-*
 | 
				
			||||||
 | 
					.ctags
 | 
				
			||||||
 | 
					tags
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Precompiled Headers #
 | 
					# Precompiled Headers #
 | 
				
			||||||
#######################
 | 
					#######################
 | 
				
			||||||
@@ -88,7 +90,6 @@ Thumbs.db
 | 
				
			|||||||
# build directory #
 | 
					# build directory #
 | 
				
			||||||
###################
 | 
					###################
 | 
				
			||||||
build*/*
 | 
					build*/*
 | 
				
			||||||
Documentation/_build
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# IDE related files #
 | 
					# IDE related files #
 | 
				
			||||||
#####################
 | 
					#####################
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -358,7 +358,7 @@ public:
 | 
				
			|||||||
    autoView( in_v , in, AcceleratorRead);
 | 
					    autoView( in_v , in, AcceleratorRead);
 | 
				
			||||||
    autoView( out_v , out, AcceleratorWrite);
 | 
					    autoView( out_v , out, AcceleratorWrite);
 | 
				
			||||||
    autoView( Stencil_v  , Stencil, AcceleratorRead);
 | 
					    autoView( Stencil_v  , Stencil, AcceleratorRead);
 | 
				
			||||||
    int npoint = geom.npoint;
 | 
					    auto& geom_v = geom;
 | 
				
			||||||
    typedef LatticeView<Cobj> Aview;
 | 
					    typedef LatticeView<Cobj> Aview;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    Vector<Aview> AcceleratorViewContainer;
 | 
					    Vector<Aview> AcceleratorViewContainer;
 | 
				
			||||||
@@ -380,7 +380,7 @@ public:
 | 
				
			|||||||
      int ptype;
 | 
					      int ptype;
 | 
				
			||||||
      StencilEntry *SE;
 | 
					      StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int point=0;point<npoint;point++){
 | 
					      for(int point=0;point<geom_v.npoint;point++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	SE=Stencil_v.GetEntry(ptype,point,ss);
 | 
						SE=Stencil_v.GetEntry(ptype,point,ss);
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
@@ -424,7 +424,7 @@ public:
 | 
				
			|||||||
    autoView( in_v , in, AcceleratorRead);
 | 
					    autoView( in_v , in, AcceleratorRead);
 | 
				
			||||||
    autoView( out_v , out, AcceleratorWrite);
 | 
					    autoView( out_v , out, AcceleratorWrite);
 | 
				
			||||||
    autoView( Stencil_v  , Stencil, AcceleratorRead);
 | 
					    autoView( Stencil_v  , Stencil, AcceleratorRead);
 | 
				
			||||||
    int npoint = geom.npoint;
 | 
					    auto& geom_v = geom;
 | 
				
			||||||
    typedef LatticeView<Cobj> Aview;
 | 
					    typedef LatticeView<Cobj> Aview;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Vector<Aview> AcceleratorViewContainer;
 | 
					    Vector<Aview> AcceleratorViewContainer;
 | 
				
			||||||
@@ -442,8 +442,6 @@ public:
 | 
				
			|||||||
    for(int p=0; p<geom.npoint; p++)
 | 
					    for(int p=0; p<geom.npoint; p++)
 | 
				
			||||||
      points[p] = geom.points_dagger[p];
 | 
					      points[p] = geom.points_dagger[p];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    auto points_p = &points[0];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    RealD* dag_factor_p = &dag_factor[0];
 | 
					    RealD* dag_factor_p = &dag_factor[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
 | 
					    accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
 | 
				
			||||||
@@ -454,8 +452,8 @@ public:
 | 
				
			|||||||
      int ptype;
 | 
					      int ptype;
 | 
				
			||||||
      StencilEntry *SE;
 | 
					      StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int p=0;p<npoint;p++){
 | 
					      for(int p=0;p<geom_v.npoint;p++){
 | 
				
			||||||
        int point = points_p[p];
 | 
					        int point = points[p];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	SE=Stencil_v.GetEntry(ptype,point,ss);
 | 
						SE=Stencil_v.GetEntry(ptype,point,ss);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -710,8 +708,6 @@ public:
 | 
				
			|||||||
    for(int p=0; p<npoint; p++)
 | 
					    for(int p=0; p<npoint; p++)
 | 
				
			||||||
      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
 | 
					      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    auto points_p = &points[0];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Vector<Aview> AcceleratorViewContainer;
 | 
					    Vector<Aview> AcceleratorViewContainer;
 | 
				
			||||||
    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
 | 
					    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
 | 
				
			||||||
    Aview *Aview_p = & AcceleratorViewContainer[0];
 | 
					    Aview *Aview_p = & AcceleratorViewContainer[0];
 | 
				
			||||||
@@ -732,7 +728,7 @@ public:
 | 
				
			|||||||
        StencilEntry *SE;
 | 
					        StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for(int p=0;p<npoint;p++){
 | 
					        for(int p=0;p<npoint;p++){
 | 
				
			||||||
          int point = points_p[p];
 | 
					          int point = points[p];
 | 
				
			||||||
          SE=st_v.GetEntry(ptype,point,ss);
 | 
					          SE=st_v.GetEntry(ptype,point,ss);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          if(SE->_is_local) {
 | 
					          if(SE->_is_local) {
 | 
				
			||||||
@@ -758,7 +754,7 @@ public:
 | 
				
			|||||||
        StencilEntry *SE;
 | 
					        StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for(int p=0;p<npoint;p++){
 | 
					        for(int p=0;p<npoint;p++){
 | 
				
			||||||
          int point = points_p[p];
 | 
					          int point = points[p];
 | 
				
			||||||
          SE=st_v.GetEntry(ptype,point,ss);
 | 
					          SE=st_v.GetEntry(ptype,point,ss);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          if(SE->_is_local) {
 | 
					          if(SE->_is_local) {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -136,7 +136,7 @@ public:
 | 
				
			|||||||
    flops=0;
 | 
					    flops=0;
 | 
				
			||||||
    usec =0;
 | 
					    usec =0;
 | 
				
			||||||
    Coordinate layout(Nd,1);
 | 
					    Coordinate layout(Nd,1);
 | 
				
			||||||
    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
 | 
					    sgrid = new GridCartesian(dimensions,layout,processors);
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
  ~FFT ( void)  {
 | 
					  ~FFT ( void)  {
 | 
				
			||||||
@@ -182,7 +182,7 @@ public:
 | 
				
			|||||||
    pencil_gd[dim] = G*processors[dim];
 | 
					    pencil_gd[dim] = G*processors[dim];
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    // Pencil global vol LxLxGxLxL per node
 | 
					    // Pencil global vol LxLxGxLxL per node
 | 
				
			||||||
    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
 | 
					    GridCartesian pencil_g(pencil_gd,layout,processors);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    // Construct pencils
 | 
					    // Construct pencils
 | 
				
			||||||
    typedef typename vobj::scalar_object sobj;
 | 
					    typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -52,7 +52,6 @@ public:
 | 
				
			|||||||
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
 | 
					  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
 | 
				
			||||||
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
 | 
					  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
 | 
				
			||||||
  virtual void HermOp(const Field &in, Field &out)=0;
 | 
					  virtual void HermOp(const Field &in, Field &out)=0;
 | 
				
			||||||
  virtual ~LinearOperatorBase(){};
 | 
					 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -508,7 +507,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
 | 
				
			|||||||
  virtual  void MpcDag   (const Field &in, Field &out){
 | 
					  virtual  void MpcDag   (const Field &in, Field &out){
 | 
				
			||||||
    Mpc(in,out);
 | 
					    Mpc(in,out);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  virtual void MpcDagMpc(const Field &in, Field &out) {
 | 
					  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
 | 
				
			||||||
    assert(0);// Never need with staggered
 | 
					    assert(0);// Never need with staggered
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
@@ -531,16 +530,6 @@ public:
 | 
				
			|||||||
template<class Field> class LinearFunction {
 | 
					template<class Field> class LinearFunction {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  virtual void operator() (const Field &in, Field &out) = 0;
 | 
					  virtual void operator() (const Field &in, Field &out) = 0;
 | 
				
			||||||
 | 
					 | 
				
			||||||
  virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    assert(in.size() == out.size());
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for (unsigned int i = 0; i < in.size(); ++i)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      (*this)(in[i], out[i]);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
 | 
					template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
 | 
				
			||||||
@@ -586,7 +575,6 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 | 
				
			|||||||
template<typename Field>
 | 
					template<typename Field>
 | 
				
			||||||
class PlainHermOp : public LinearFunction<Field> {
 | 
					class PlainHermOp : public LinearFunction<Field> {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
  LinearOperatorBase<Field> &_Linop;
 | 
					  LinearOperatorBase<Field> &_Linop;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
 | 
					  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) 
 | 
				
			||||||
@@ -600,7 +588,6 @@ public:
 | 
				
			|||||||
template<typename Field>
 | 
					template<typename Field>
 | 
				
			||||||
class FunctionHermOp : public LinearFunction<Field> {
 | 
					class FunctionHermOp : public LinearFunction<Field> {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator(); 
 | 
					 | 
				
			||||||
  OperatorFunction<Field>   & _poly;
 | 
					  OperatorFunction<Field>   & _poly;
 | 
				
			||||||
  LinearOperatorBase<Field> &_Linop;
 | 
					  LinearOperatorBase<Field> &_Linop;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -30,19 +30,13 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
NAMESPACE_BEGIN(Grid);
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Field> using Preconditioner =  LinearFunction<Field> ;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*
 | 
					 | 
				
			||||||
template<class Field> class Preconditioner :  public LinearFunction<Field> { 
 | 
					template<class Field> class Preconditioner :  public LinearFunction<Field> { 
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
  virtual void operator()(const Field &src, Field & psi)=0;
 | 
					  virtual void operator()(const Field &src, Field & psi)=0;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
*/
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 | 
					template<class Field> class TrivialPrecon :  public Preconditioner<Field> { 
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using Preconditioner<Field>::operator();
 | 
					  void operator()(const Field &src, Field & psi){
 | 
				
			||||||
  virtual void operator()(const Field &src, Field & psi){
 | 
					 | 
				
			||||||
    psi = src;
 | 
					    psi = src;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  TrivialPrecon(void){};
 | 
					  TrivialPrecon(void){};
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -48,7 +48,6 @@ public:
 | 
				
			|||||||
  virtual  void Mdiag    (const Field &in, Field &out)=0;
 | 
					  virtual  void Mdiag    (const Field &in, Field &out)=0;
 | 
				
			||||||
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
 | 
					  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
 | 
				
			||||||
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 | 
					  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
 | 
				
			||||||
  virtual ~SparseMatrixBase() {};
 | 
					 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/////////////////////////////////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -73,7 +72,7 @@ public:
 | 
				
			|||||||
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
 | 
					  virtual  void MeooeDag    (const Field &in, Field &out)=0;
 | 
				
			||||||
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
 | 
					  virtual  void MooeeDag    (const Field &in, Field &out)=0;
 | 
				
			||||||
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
 | 
					  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
 | 
				
			||||||
  virtual ~CheckerBoardedSparseMatrixBase() {};
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
NAMESPACE_END(Grid);
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -240,12 +240,14 @@ public:
 | 
				
			|||||||
    Field T0(grid); T0 = in;  
 | 
					    Field T0(grid); T0 = in;  
 | 
				
			||||||
    Field T1(grid); 
 | 
					    Field T1(grid); 
 | 
				
			||||||
    Field T2(grid);
 | 
					    Field T2(grid);
 | 
				
			||||||
 | 
					    Field Tout(grid);
 | 
				
			||||||
    Field y(grid);
 | 
					    Field y(grid);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    Field *Tnm = &T0;
 | 
					    Field *Tnm = &T0;
 | 
				
			||||||
    Field *Tn  = &T1;
 | 
					    Field *Tn  = &T1;
 | 
				
			||||||
    Field *Tnp = &T2;
 | 
					    Field *Tnp = &T2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Chebyshev() starts"<<std::endl;
 | 
				
			||||||
    // Tn=T1 = (xscale M + mscale)in
 | 
					    // Tn=T1 = (xscale M + mscale)in
 | 
				
			||||||
    RealD xscale = 2.0/(hi-lo);
 | 
					    RealD xscale = 2.0/(hi-lo);
 | 
				
			||||||
    RealD mscale = -(hi+lo)/(hi-lo);
 | 
					    RealD mscale = -(hi+lo)/(hi-lo);
 | 
				
			||||||
@@ -254,7 +256,7 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    // sum = .5 c[0] T0 + c[1] T1
 | 
					    // sum = .5 c[0] T0 + c[1] T1
 | 
				
			||||||
    //    out = ()*T0 + Coeffs[1]*T1;
 | 
					    //    out = ()*T0 + Coeffs[1]*T1;
 | 
				
			||||||
    axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
 | 
					    axpby(Tout,0.5*Coeffs[0],Coeffs[1],T0,T1);
 | 
				
			||||||
    for(int n=2;n<order;n++){
 | 
					    for(int n=2;n<order;n++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      Linop.HermOp(*Tn,y);
 | 
					      Linop.HermOp(*Tn,y);
 | 
				
			||||||
@@ -275,7 +277,7 @@ public:
 | 
				
			|||||||
      axpby(y,xscale,mscale,y,(*Tn));
 | 
					      axpby(y,xscale,mscale,y,(*Tn));
 | 
				
			||||||
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
 | 
					      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
 | 
				
			||||||
      if ( Coeffs[n] != 0.0) {
 | 
					      if ( Coeffs[n] != 0.0) {
 | 
				
			||||||
	axpy(out,Coeffs[n],*Tnp,out);
 | 
						axpy(Tout,Coeffs[n],*Tnp,Tout);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
      // Cycle pointers to avoid copies
 | 
					      // Cycle pointers to avoid copies
 | 
				
			||||||
@@ -285,6 +287,8 @@ public:
 | 
				
			|||||||
      Tnp    =swizzle;
 | 
					      Tnp    =swizzle;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    out = Tout;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "Chebyshev() ends"<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -377,24 +381,26 @@ public:
 | 
				
			|||||||
    Field T0(grid); T0 = in;  
 | 
					    Field T0(grid); T0 = in;  
 | 
				
			||||||
    Field T1(grid); 
 | 
					    Field T1(grid); 
 | 
				
			||||||
    Field T2(grid);
 | 
					    Field T2(grid);
 | 
				
			||||||
 | 
					    Field Tout(grid);
 | 
				
			||||||
    Field  y(grid);
 | 
					    Field  y(grid);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    Field *Tnm = &T0;
 | 
					    Field *Tnm = &T0;
 | 
				
			||||||
    Field *Tn  = &T1;
 | 
					    Field *Tn  = &T1;
 | 
				
			||||||
    Field *Tnp = &T2;
 | 
					    Field *Tnp = &T2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "ChebyshevLanczos() starts"<<std::endl;
 | 
				
			||||||
    // Tn=T1 = (xscale M )*in
 | 
					    // Tn=T1 = (xscale M )*in
 | 
				
			||||||
    AminusMuSq(Linop,T0,T1);
 | 
					    AminusMuSq(Linop,T0,T1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // sum = .5 c[0] T0 + c[1] T1
 | 
					    // sum = .5 c[0] T0 + c[1] T1
 | 
				
			||||||
    out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
 | 
					    Tout = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
 | 
				
			||||||
    for(int n=2;n<order;n++){
 | 
					    for(int n=2;n<order;n++){
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
      AminusMuSq(Linop,*Tn,y);
 | 
					      AminusMuSq(Linop,*Tn,y);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      *Tnp=2.0*y-(*Tnm);
 | 
					      *Tnp=2.0*y-(*Tnm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      out=out+Coeffs[n]* (*Tnp);
 | 
					      Tout=Tout+Coeffs[n]* (*Tnp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Cycle pointers to avoid copies
 | 
					      // Cycle pointers to avoid copies
 | 
				
			||||||
      Field *swizzle = Tnm;
 | 
					      Field *swizzle = Tnm;
 | 
				
			||||||
@@ -403,6 +409,8 @@ public:
 | 
				
			|||||||
      Tnp    =swizzle;
 | 
					      Tnp    =swizzle;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    out=Tout;
 | 
				
			||||||
 | 
					    std::cout << GridLogMessage << "ChebyshevLanczos() ends"<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
NAMESPACE_END(Grid);
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -37,7 +37,6 @@ template<class FieldD, class FieldF, typename std::enable_if< getPrecision<Field
 | 
				
			|||||||
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 | 
					class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> 
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  public:                                                
 | 
					  public:                                                
 | 
				
			||||||
    using LinearFunction<FieldD>::operator();
 | 
					 | 
				
			||||||
    RealD   Tolerance;
 | 
					    RealD   Tolerance;
 | 
				
			||||||
    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 | 
					    RealD   InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 | 
				
			||||||
    Integer MaxInnerIterations;
 | 
					    Integer MaxInnerIterations;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 | 
					    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 | 
				
			||||||
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
 | 
					  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
 | 
				
			||||||
  public:                                                
 | 
					  public:                                                
 | 
				
			||||||
    using LinearFunction<FieldD>::operator();
 | 
					 | 
				
			||||||
    RealD   Tolerance;
 | 
					    RealD   Tolerance;
 | 
				
			||||||
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 | 
					    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 | 
				
			||||||
    Integer MaxInnerIterations;
 | 
					    Integer MaxInnerIterations;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -33,19 +33,16 @@ namespace Grid {
 | 
				
			|||||||
template<class Field>
 | 
					template<class Field>
 | 
				
			||||||
class ZeroGuesser: public LinearFunction<Field> {
 | 
					class ZeroGuesser: public LinearFunction<Field> {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 | 
					    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
template<class Field>
 | 
					template<class Field>
 | 
				
			||||||
class DoNothingGuesser: public LinearFunction<Field> {
 | 
					class DoNothingGuesser: public LinearFunction<Field> {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
  virtual void operator()(const Field &src, Field &guess) {  };
 | 
					  virtual void operator()(const Field &src, Field &guess) {  };
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
template<class Field>
 | 
					template<class Field>
 | 
				
			||||||
class SourceGuesser: public LinearFunction<Field> {
 | 
					class SourceGuesser: public LinearFunction<Field> {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 | 
					  virtual void operator()(const Field &src, Field &guess) { guess = src; };
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -57,24 +54,15 @@ class DeflatedGuesser: public LinearFunction<Field> {
 | 
				
			|||||||
private:
 | 
					private:
 | 
				
			||||||
  const std::vector<Field> &evec;
 | 
					  const std::vector<Field> &evec;
 | 
				
			||||||
  const std::vector<RealD> &eval;
 | 
					  const std::vector<RealD> &eval;
 | 
				
			||||||
  const unsigned int       N;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
 | 
					  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
 | 
				
			||||||
  : DeflatedGuesser(_evec, _eval, _evec.size())
 | 
					 | 
				
			||||||
  {}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
 | 
					 | 
				
			||||||
  : evec(_evec), eval(_eval), N(_N)
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    assert(evec.size()==eval.size());
 | 
					 | 
				
			||||||
    assert(N <= evec.size());
 | 
					 | 
				
			||||||
  } 
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  virtual void operator()(const Field &src,Field &guess) {
 | 
					  virtual void operator()(const Field &src,Field &guess) {
 | 
				
			||||||
    guess = Zero();
 | 
					    guess = Zero();
 | 
				
			||||||
 | 
					    assert(evec.size()==eval.size());
 | 
				
			||||||
 | 
					    auto N = evec.size();
 | 
				
			||||||
    for (int i=0;i<N;i++) {
 | 
					    for (int i=0;i<N;i++) {
 | 
				
			||||||
      const Field& tmp = evec[i];
 | 
					      const Field& tmp = evec[i];
 | 
				
			||||||
      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
 | 
					      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
 | 
				
			||||||
@@ -91,7 +79,6 @@ private:
 | 
				
			|||||||
  const std::vector<RealD>       &eval_coarse;
 | 
					  const std::vector<RealD>       &eval_coarse;
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  using LinearFunction<FineField>::operator();
 | 
					 | 
				
			||||||
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 | 
					  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 | 
				
			||||||
				const std::vector<CoarseField> &_evec_coarse,
 | 
									const std::vector<CoarseField> &_evec_coarse,
 | 
				
			||||||
				const std::vector<RealD>       &_eval_coarse)
 | 
									const std::vector<RealD>       &_eval_coarse)
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1555
									
								
								Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1555
									
								
								Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -67,7 +67,6 @@ public:
 | 
				
			|||||||
template<class Fobj,class CComplex,int nbasis>
 | 
					template<class Fobj,class CComplex,int nbasis>
 | 
				
			||||||
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 | 
					class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
 | 
					 | 
				
			||||||
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
 | 
					  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
 | 
				
			||||||
  typedef Lattice<CoarseSiteVector>           CoarseField;
 | 
					  typedef Lattice<CoarseSiteVector>           CoarseField;
 | 
				
			||||||
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
 | 
					  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
 | 
				
			||||||
@@ -98,7 +97,6 @@ public:
 | 
				
			|||||||
template<class Fobj,class CComplex,int nbasis>
 | 
					template<class Fobj,class CComplex,int nbasis>
 | 
				
			||||||
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 | 
					class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
 | 
					 | 
				
			||||||
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
 | 
					  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
 | 
				
			||||||
  typedef Lattice<CoarseSiteVector>           CoarseField;
 | 
					  typedef Lattice<CoarseSiteVector>           CoarseField;
 | 
				
			||||||
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
 | 
					  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
template<class Field>
 | 
					template<class Field>
 | 
				
			||||||
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 | 
					class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 | 
				
			||||||
public:                                                
 | 
					public:                                                
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					
 | 
				
			||||||
  RealD   Tolerance;
 | 
					  RealD   Tolerance;
 | 
				
			||||||
  Integer MaxIterations;
 | 
					  Integer MaxIterations;
 | 
				
			||||||
  int verbose;
 | 
					  int verbose;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
template<class Field>
 | 
					template<class Field>
 | 
				
			||||||
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 | 
					class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 | 
				
			||||||
public:                                                
 | 
					public:                                                
 | 
				
			||||||
  using LinearFunction<Field>::operator();
 | 
					
 | 
				
			||||||
  RealD   Tolerance;
 | 
					  RealD   Tolerance;
 | 
				
			||||||
  Integer MaxIterations;
 | 
					  Integer MaxIterations;
 | 
				
			||||||
  int verbose;
 | 
					  int verbose;
 | 
				
			||||||
@@ -119,8 +119,7 @@ public:
 | 
				
			|||||||
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
 | 
					  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    RealD cp;
 | 
					    RealD cp;
 | 
				
			||||||
    ComplexD a, b;
 | 
					    ComplexD a, b, zAz;
 | 
				
			||||||
    //    ComplexD zAz;
 | 
					 | 
				
			||||||
    RealD zAAz;
 | 
					    RealD zAAz;
 | 
				
			||||||
    ComplexD rq;
 | 
					    ComplexD rq;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -147,7 +146,7 @@ public:
 | 
				
			|||||||
    //////////////////////////////////
 | 
					    //////////////////////////////////
 | 
				
			||||||
    MatTimer.Start();
 | 
					    MatTimer.Start();
 | 
				
			||||||
    Linop.Op(psi,Az);
 | 
					    Linop.Op(psi,Az);
 | 
				
			||||||
    //    zAz = innerProduct(Az,psi);
 | 
					    zAz = innerProduct(Az,psi);
 | 
				
			||||||
    zAAz= norm2(Az);
 | 
					    zAAz= norm2(Az);
 | 
				
			||||||
    MatTimer.Stop();
 | 
					    MatTimer.Stop();
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
@@ -171,7 +170,7 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    LinalgTimer.Start();
 | 
					    LinalgTimer.Start();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //    zAz = innerProduct(Az,psi);
 | 
					    zAz = innerProduct(Az,psi);
 | 
				
			||||||
    zAAz= norm2(Az);
 | 
					    zAAz= norm2(Az);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    //p[0],q[0],qq[0] 
 | 
					    //p[0],q[0],qq[0] 
 | 
				
			||||||
@@ -213,7 +212,7 @@ public:
 | 
				
			|||||||
      MatTimer.Start();
 | 
					      MatTimer.Start();
 | 
				
			||||||
      Linop.Op(z,Az);
 | 
					      Linop.Op(z,Az);
 | 
				
			||||||
      MatTimer.Stop();
 | 
					      MatTimer.Stop();
 | 
				
			||||||
      //      zAz = innerProduct(Az,psi);
 | 
					      zAz = innerProduct(Az,psi);
 | 
				
			||||||
      zAAz= norm2(Az);
 | 
					      zAAz= norm2(Az);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LinalgTimer.Start();
 | 
					      LinalgTimer.Start();
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -132,31 +132,6 @@ namespace Grid {
 | 
				
			|||||||
      (*this)(_Matrix,in,out,guess);
 | 
					      (*this)(_Matrix,in,out,guess);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o) 
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      GridBase *grid = _Matrix.RedBlackGrid();
 | 
					 | 
				
			||||||
      Field tmp(grid);
 | 
					 | 
				
			||||||
      int nblock = in.size();
 | 
					 | 
				
			||||||
      for(int b=0;b<nblock;b++){
 | 
					 | 
				
			||||||
	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    // James can write his own deflated guesser
 | 
					 | 
				
			||||||
    // with optimised code for the inner products
 | 
					 | 
				
			||||||
    //    RedBlackSolveSplitGrid();
 | 
					 | 
				
			||||||
    //    RedBlackSolve(_Matrix,src_o,sol_o); 
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      GridBase *grid = _Matrix.RedBlackGrid();
 | 
					 | 
				
			||||||
      Field tmp(grid);
 | 
					 | 
				
			||||||
      int nblock = in.size();
 | 
					 | 
				
			||||||
      for(int b=0;b<nblock;b++) {
 | 
					 | 
				
			||||||
	pickCheckerboard(Even,tmp,in[b]);
 | 
					 | 
				
			||||||
	RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    template<class Guesser>
 | 
					    template<class Guesser>
 | 
				
			||||||
    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
 | 
					    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
@@ -175,29 +150,24 @@ namespace Grid {
 | 
				
			|||||||
      ////////////////////////////////////////////////
 | 
					      ////////////////////////////////////////////////
 | 
				
			||||||
      // Prepare RedBlack source
 | 
					      // Prepare RedBlack source
 | 
				
			||||||
      ////////////////////////////////////////////////
 | 
					      ////////////////////////////////////////////////
 | 
				
			||||||
      RedBlackSource(_Matrix,in,src_o);
 | 
					      for(int b=0;b<nblock;b++){
 | 
				
			||||||
	//      for(int b=0;b<nblock;b++){
 | 
						RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
 | 
				
			||||||
	//	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
 | 
					      }
 | 
				
			||||||
	//      }
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      ////////////////////////////////////////////////
 | 
					      ////////////////////////////////////////////////
 | 
				
			||||||
      // Make the guesses
 | 
					      // Make the guesses
 | 
				
			||||||
      ////////////////////////////////////////////////
 | 
					      ////////////////////////////////////////////////
 | 
				
			||||||
      if ( subGuess ) guess_save.resize(nblock,grid);
 | 
					      if ( subGuess ) guess_save.resize(nblock,grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      
 | 
					      for(int b=0;b<nblock;b++){
 | 
				
			||||||
      if(useSolnAsInitGuess) {
 | 
					        if(useSolnAsInitGuess) {
 | 
				
			||||||
        for(int b=0;b<nblock;b++){
 | 
					 | 
				
			||||||
          pickCheckerboard(Odd, sol_o[b], out[b]);
 | 
					          pickCheckerboard(Odd, sol_o[b], out[b]);
 | 
				
			||||||
 | 
					        } else {
 | 
				
			||||||
 | 
					          guess(src_o[b],sol_o[b]); 
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      } else {
 | 
					 | 
				
			||||||
        guess(src_o, sol_o); 
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    if ( subGuess ) { 
 | 
						if ( subGuess ) { 
 | 
				
			||||||
        for(int b=0;b<nblock;b++){
 | 
						  guess_save[b] = sol_o[b];
 | 
				
			||||||
          guess_save[b] = sol_o[b];
 | 
						}
 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      //////////////////////////////////////////////////////////////
 | 
					      //////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Call the block solver
 | 
					      // Call the block solver
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,30 +9,14 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
#define AccSmall (3)
 | 
					#define AccSmall (3)
 | 
				
			||||||
#define Shared   (4)
 | 
					#define Shared   (4)
 | 
				
			||||||
#define SharedSmall (5)
 | 
					#define SharedSmall (5)
 | 
				
			||||||
#undef GRID_MM_VERBOSE 
 | 
					 | 
				
			||||||
uint64_t total_shared;
 | 
					uint64_t total_shared;
 | 
				
			||||||
uint64_t total_device;
 | 
					uint64_t total_device;
 | 
				
			||||||
uint64_t total_host;;
 | 
					uint64_t total_host;;
 | 
				
			||||||
void MemoryManager::PrintBytes(void)
 | 
					void MemoryManager::PrintBytes(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
 | 
					  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
 | 
				
			||||||
  std::cout << " MemoryManager : PrintBytes "<<std::endl;
 | 
					  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
 | 
				
			||||||
  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
 | 
					  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
 | 
				
			||||||
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  uint64_t cacheBytes;
 | 
					 | 
				
			||||||
  cacheBytes = CacheBytes[Cpu];
 | 
					 | 
				
			||||||
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  cacheBytes = CacheBytes[Acc];
 | 
					 | 
				
			||||||
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  cacheBytes = CacheBytes[Shared];
 | 
					 | 
				
			||||||
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
#ifdef GRID_CUDA
 | 
					 | 
				
			||||||
  cuda_mem();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -40,114 +24,88 @@ void MemoryManager::PrintBytes(void)
 | 
				
			|||||||
//////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////
 | 
				
			||||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 | 
					MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 | 
				
			||||||
int MemoryManager::Victim[MemoryManager::NallocType];
 | 
					int MemoryManager::Victim[MemoryManager::NallocType];
 | 
				
			||||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
 | 
					int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
 | 
				
			||||||
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 | 
					
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Actual allocation and deallocation utils
 | 
					// Actual allocation and deallocation utils
 | 
				
			||||||
//////////////////////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////////////////////
 | 
				
			||||||
void *MemoryManager::AcceleratorAllocate(size_t bytes)
 | 
					void *MemoryManager::AcceleratorAllocate(size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_device+=bytes;
 | 
					 | 
				
			||||||
  void *ptr = (void *) Lookup(bytes,Acc);
 | 
					  void *ptr = (void *) Lookup(bytes,Acc);
 | 
				
			||||||
  if ( ptr == (void *) NULL ) {
 | 
					  if ( ptr == (void *) NULL ) {
 | 
				
			||||||
    ptr = (void *) acceleratorAllocDevice(bytes);
 | 
					    ptr = (void *) acceleratorAllocDevice(bytes);
 | 
				
			||||||
 | 
					    total_device+=bytes;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"AcceleratorAllocate "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  return ptr;
 | 
					  return ptr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 | 
					void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_device-=bytes;
 | 
					 | 
				
			||||||
  void *__freeme = Insert(ptr,bytes,Acc);
 | 
					  void *__freeme = Insert(ptr,bytes,Acc);
 | 
				
			||||||
  if ( __freeme ) {
 | 
					  if ( __freeme ) {
 | 
				
			||||||
    acceleratorFreeDevice(__freeme);
 | 
					    acceleratorFreeDevice(__freeme);
 | 
				
			||||||
 | 
					    total_device-=bytes;
 | 
				
			||||||
 | 
					//       PrintBytes();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"AcceleratorFree "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void *MemoryManager::SharedAllocate(size_t bytes)
 | 
					void *MemoryManager::SharedAllocate(size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_shared+=bytes;
 | 
					 | 
				
			||||||
  void *ptr = (void *) Lookup(bytes,Shared);
 | 
					  void *ptr = (void *) Lookup(bytes,Shared);
 | 
				
			||||||
  if ( ptr == (void *) NULL ) {
 | 
					  if ( ptr == (void *) NULL ) {
 | 
				
			||||||
    ptr = (void *) acceleratorAllocShared(bytes);
 | 
					    ptr = (void *) acceleratorAllocShared(bytes);
 | 
				
			||||||
 | 
					    total_shared+=bytes;
 | 
				
			||||||
 | 
					        std::cout <<GridLogMessage<<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					//        PrintBytes();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"SharedAllocate "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  return ptr;
 | 
					  return ptr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 | 
					void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_shared-=bytes;
 | 
					 | 
				
			||||||
  void *__freeme = Insert(ptr,bytes,Shared);
 | 
					  void *__freeme = Insert(ptr,bytes,Shared);
 | 
				
			||||||
  if ( __freeme ) {
 | 
					  if ( __freeme ) {
 | 
				
			||||||
    acceleratorFreeShared(__freeme);
 | 
					    acceleratorFreeShared(__freeme);
 | 
				
			||||||
 | 
					    total_shared-=bytes;
 | 
				
			||||||
 | 
					    //    PrintBytes();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"SharedFree "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#ifdef GRID_UVM
 | 
					#ifdef GRID_UVM
 | 
				
			||||||
void *MemoryManager::CpuAllocate(size_t bytes)
 | 
					void *MemoryManager::CpuAllocate(size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_host+=bytes;
 | 
					 | 
				
			||||||
  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
					  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
				
			||||||
  if ( ptr == (void *) NULL ) {
 | 
					  if ( ptr == (void *) NULL ) {
 | 
				
			||||||
    ptr = (void *) acceleratorAllocShared(bytes);
 | 
					    ptr = (void *) acceleratorAllocShared(bytes);
 | 
				
			||||||
 | 
					    total_host+=bytes;
 | 
				
			||||||
 | 
					//    std::cout << GridLogMessage<< "MemoryManager:: CpuAllocate  total_host= "<<total_host<<" "<< ptr << std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"CpuAllocate "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  return ptr;
 | 
					  return ptr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
					void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_host-=bytes;
 | 
					 | 
				
			||||||
  NotifyDeletion(_ptr);
 | 
					  NotifyDeletion(_ptr);
 | 
				
			||||||
  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
					  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
				
			||||||
  if ( __freeme ) { 
 | 
					  if ( __freeme ) { 
 | 
				
			||||||
    acceleratorFreeShared(__freeme);
 | 
					    acceleratorFreeShared(__freeme);
 | 
				
			||||||
 | 
					//    std::cout << GridLogMessage<< "MemoryManager:: CpuFree  total_host= "<<total_host<<" "<< __freeme << std::endl;
 | 
				
			||||||
 | 
					    total_host-=bytes;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"CpuFree "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
void *MemoryManager::CpuAllocate(size_t bytes)
 | 
					void *MemoryManager::CpuAllocate(size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_host+=bytes;
 | 
					 | 
				
			||||||
  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
					  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
				
			||||||
  if ( ptr == (void *) NULL ) {
 | 
					  if ( ptr == (void *) NULL ) {
 | 
				
			||||||
    ptr = (void *) acceleratorAllocCpu(bytes);
 | 
					    ptr = (void *) acceleratorAllocCpu(bytes);
 | 
				
			||||||
 | 
					    total_host+=bytes;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"CpuAllocate "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  return ptr;
 | 
					  return ptr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
					void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  total_host-=bytes;
 | 
					 | 
				
			||||||
  NotifyDeletion(_ptr);
 | 
					  NotifyDeletion(_ptr);
 | 
				
			||||||
  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
					  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
				
			||||||
  if ( __freeme ) { 
 | 
					  if ( __freeme ) { 
 | 
				
			||||||
    acceleratorFreeCpu(__freeme);
 | 
					    acceleratorFreeCpu(__freeme);
 | 
				
			||||||
 | 
					    total_host-=bytes;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#ifdef GRID_MM_VERBOSE
 | 
					 | 
				
			||||||
  std::cout <<"CpuFree "<<std::endl;
 | 
					 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -159,6 +117,7 @@ void MemoryManager::Init(void)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  char * str;
 | 
					  char * str;
 | 
				
			||||||
  int Nc;
 | 
					  int Nc;
 | 
				
			||||||
 | 
					  int NcS;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
 | 
					  str= getenv("GRID_ALLOC_NCACHE_LARGE");
 | 
				
			||||||
  if ( str ) {
 | 
					  if ( str ) {
 | 
				
			||||||
@@ -224,13 +183,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 | 
				
			|||||||
#ifdef ALLOCATION_CACHE
 | 
					#ifdef ALLOCATION_CACHE
 | 
				
			||||||
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
					  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
				
			||||||
  int cache = type + small;
 | 
					  int cache = type + small;
 | 
				
			||||||
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 | 
					  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
  return ptr;
 | 
					  return ptr;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 | 
					void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert(ncache>0);
 | 
					  assert(ncache>0);
 | 
				
			||||||
#ifdef GRID_OMP
 | 
					#ifdef GRID_OMP
 | 
				
			||||||
@@ -254,7 +213,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  if ( entries[v].valid ) {
 | 
					  if ( entries[v].valid ) {
 | 
				
			||||||
    ret = entries[v].address;
 | 
					    ret = entries[v].address;
 | 
				
			||||||
    cacheBytes -= entries[v].bytes;
 | 
					 | 
				
			||||||
    entries[v].valid = 0;
 | 
					    entries[v].valid = 0;
 | 
				
			||||||
    entries[v].address = NULL;
 | 
					    entries[v].address = NULL;
 | 
				
			||||||
    entries[v].bytes = 0;
 | 
					    entries[v].bytes = 0;
 | 
				
			||||||
@@ -263,7 +221,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 | 
				
			|||||||
  entries[v].address=ptr;
 | 
					  entries[v].address=ptr;
 | 
				
			||||||
  entries[v].bytes  =bytes;
 | 
					  entries[v].bytes  =bytes;
 | 
				
			||||||
  entries[v].valid  =1;
 | 
					  entries[v].valid  =1;
 | 
				
			||||||
  cacheBytes += bytes;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return ret;
 | 
					  return ret;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -273,13 +230,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 | 
				
			|||||||
#ifdef ALLOCATION_CACHE
 | 
					#ifdef ALLOCATION_CACHE
 | 
				
			||||||
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
					  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
				
			||||||
  int cache = type+small;
 | 
					  int cache = type+small;
 | 
				
			||||||
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 | 
					  return Lookup(bytes,Entries[cache],Ncache[cache]);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
  return NULL;
 | 
					  return NULL;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 | 
					void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert(ncache>0);
 | 
					  assert(ncache>0);
 | 
				
			||||||
#ifdef GRID_OMP
 | 
					#ifdef GRID_OMP
 | 
				
			||||||
@@ -288,7 +245,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
 | 
				
			|||||||
  for(int e=0;e<ncache;e++){
 | 
					  for(int e=0;e<ncache;e++){
 | 
				
			||||||
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
 | 
					    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
 | 
				
			||||||
      entries[e].valid = 0;
 | 
					      entries[e].valid = 0;
 | 
				
			||||||
      cacheBytes -= entries[e].bytes;
 | 
					 | 
				
			||||||
      return entries[e].address;
 | 
					      return entries[e].address;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -82,15 +82,14 @@ private:
 | 
				
			|||||||
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 | 
					  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 | 
				
			||||||
  static int Victim[NallocType];
 | 
					  static int Victim[NallocType];
 | 
				
			||||||
  static int Ncache[NallocType];
 | 
					  static int Ncache[NallocType];
 | 
				
			||||||
  static uint64_t CacheBytes[NallocType];
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /////////////////////////////////////////////////
 | 
					  /////////////////////////////////////////////////
 | 
				
			||||||
  // Free pool
 | 
					  // Free pool
 | 
				
			||||||
  /////////////////////////////////////////////////
 | 
					  /////////////////////////////////////////////////
 | 
				
			||||||
  static void *Insert(void *ptr,size_t bytes,int type) ;
 | 
					  static void *Insert(void *ptr,size_t bytes,int type) ;
 | 
				
			||||||
  static void *Lookup(size_t bytes,int type) ;
 | 
					  static void *Lookup(size_t bytes,int type) ;
 | 
				
			||||||
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
 | 
					  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
 | 
				
			||||||
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
 | 
					  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  static void PrintBytes(void);
 | 
					  static void PrintBytes(void);
 | 
				
			||||||
 public:
 | 
					 public:
 | 
				
			||||||
@@ -114,11 +113,6 @@ private:
 | 
				
			|||||||
  static uint64_t     HostToDeviceXfer;
 | 
					  static uint64_t     HostToDeviceXfer;
 | 
				
			||||||
  static uint64_t     DeviceToHostXfer;
 | 
					  static uint64_t     DeviceToHostXfer;
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
  static uint64_t     DeviceAccesses;
 | 
					 | 
				
			||||||
  static uint64_t     HostAccesses;
 | 
					 | 
				
			||||||
  static uint64_t     DeviceAccessBytes;
 | 
					 | 
				
			||||||
  static uint64_t     HostAccessBytes;
 | 
					 | 
				
			||||||
 
 | 
					 | 
				
			||||||
 private:
 | 
					 private:
 | 
				
			||||||
#ifndef GRID_UVM
 | 
					#ifndef GRID_UVM
 | 
				
			||||||
  //////////////////////////////////////////////////////////////////////
 | 
					  //////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -157,7 +151,6 @@ private:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  //  static void  LRUupdate(AcceleratorViewEntry &AccCache);
 | 
					  //  static void  LRUupdate(AcceleratorViewEntry &AccCache);
 | 
				
			||||||
  static void  LRUinsert(AcceleratorViewEntry &AccCache);
 | 
					  static void  LRUinsert(AcceleratorViewEntry &AccCache);
 | 
				
			||||||
  static void  LRUinsertback(AcceleratorViewEntry &AccCache);
 | 
					 | 
				
			||||||
  static void  LRUremove(AcceleratorViewEntry &AccCache);
 | 
					  static void  LRUremove(AcceleratorViewEntry &AccCache);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // manage entries in the table
 | 
					  // manage entries in the table
 | 
				
			||||||
@@ -176,7 +169,6 @@ private:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 public:
 | 
					 public:
 | 
				
			||||||
  static void Print(void);
 | 
					  static void Print(void);
 | 
				
			||||||
  static void PrintState( void* CpuPtr);
 | 
					 | 
				
			||||||
  static int   isOpen   (void* CpuPtr);
 | 
					  static int   isOpen   (void* CpuPtr);
 | 
				
			||||||
  static void  ViewClose(void* CpuPtr,ViewMode mode);
 | 
					  static void  ViewClose(void* CpuPtr,ViewMode mode);
 | 
				
			||||||
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
					  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,7 +3,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#warning "Using explicit device memory copies"
 | 
					#warning "Using explicit device memory copies"
 | 
				
			||||||
NAMESPACE_BEGIN(Grid);
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 | 
					//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 | 
				
			||||||
#define dprintf(...)
 | 
					#define dprintf(...)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -23,11 +23,6 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
				
			|||||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
					uint64_t  MemoryManager::DeviceToHostBytes;
 | 
				
			||||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
					uint64_t  MemoryManager::HostToDeviceXfer;
 | 
				
			||||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
					uint64_t  MemoryManager::DeviceToHostXfer;
 | 
				
			||||||
uint64_t  MemoryManager::DeviceAccesses;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::HostAccesses;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::DeviceAccessBytes;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::HostAccessBytes;
 | 
					 | 
				
			||||||
 
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////
 | 
					////////////////////////////////////
 | 
				
			||||||
// Priority ordering for unlocked entries
 | 
					// Priority ordering for unlocked entries
 | 
				
			||||||
@@ -91,14 +86,6 @@ void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 | 
				
			|||||||
  AccCache.LRU_valid = 1;
 | 
					  AccCache.LRU_valid = 1;
 | 
				
			||||||
  DeviceLRUBytes+=AccCache.bytes;
 | 
					  DeviceLRUBytes+=AccCache.bytes;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::LRUinsertback(AcceleratorViewEntry &AccCache)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  assert(AccCache.LRU_valid==0);
 | 
					 | 
				
			||||||
  LRU.push_back(AccCache.CpuPtr);
 | 
					 | 
				
			||||||
  AccCache.LRU_entry = --LRU.end();
 | 
					 | 
				
			||||||
  AccCache.LRU_valid = 1;
 | 
					 | 
				
			||||||
  DeviceLRUBytes+=AccCache.bytes;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 | 
					void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert(AccCache.LRU_valid==1);
 | 
					  assert(AccCache.LRU_valid==1);
 | 
				
			||||||
@@ -142,7 +129,6 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 | 
				
			|||||||
  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
 | 
					  dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
 | 
				
			||||||
  assert(AccCache.accLock==0);
 | 
					  assert(AccCache.accLock==0);
 | 
				
			||||||
  assert(AccCache.cpuLock==0);
 | 
					  assert(AccCache.cpuLock==0);
 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  if(AccCache.state==AccDirty) {
 | 
					  if(AccCache.state==AccDirty) {
 | 
				
			||||||
    Flush(AccCache);
 | 
					    Flush(AccCache);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -245,9 +231,6 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 | 
				
			|||||||
    EntryCreate(CpuPtr,bytes,mode,hint);
 | 
					    EntryCreate(CpuPtr,bytes,mode,hint);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  DeviceAccesses++;
 | 
					 | 
				
			||||||
  DeviceAccessBytes+=bytes;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
					  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
				
			||||||
  auto & AccCache = AccCacheIterator->second;
 | 
					  auto & AccCache = AccCacheIterator->second;
 | 
				
			||||||
  if (!AccCache.AccPtr) {
 | 
					  if (!AccCache.AccPtr) {
 | 
				
			||||||
@@ -366,10 +349,6 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
 | 
				
			|||||||
  assert(AccCache.accLock==0);
 | 
					  assert(AccCache.accLock==0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  AccCache.cpuLock--;
 | 
					  AccCache.cpuLock--;
 | 
				
			||||||
 | 
					 | 
				
			||||||
  if(AccCache.cpuLock==0) {
 | 
					 | 
				
			||||||
    LRUinsertback(AccCache);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 *  Action  State   StateNext         Flush    Clone
 | 
					 *  Action  State   StateNext         Flush    Clone
 | 
				
			||||||
@@ -392,9 +371,6 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
 | 
				
			|||||||
    EntryCreate(CpuPtr,bytes,mode,transient);
 | 
					    EntryCreate(CpuPtr,bytes,mode,transient);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HostAccesses++;
 | 
					 | 
				
			||||||
  HostAccessBytes+=bytes;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
					  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
				
			||||||
  auto & AccCache = AccCacheIterator->second;
 | 
					  auto & AccCache = AccCacheIterator->second;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -440,12 +416,6 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  AccCache.transient= transient? EvictNext : 0;
 | 
					  AccCache.transient= transient? EvictNext : 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // If view is opened on host remove from LRU
 | 
					 | 
				
			||||||
  // Host close says evict next from device
 | 
					 | 
				
			||||||
  if(AccCache.LRU_valid==1){
 | 
					 | 
				
			||||||
    LRUremove(AccCache);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  return AccCache.CpuPtr;
 | 
					  return AccCache.CpuPtr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
					void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
				
			||||||
@@ -459,7 +429,6 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
void  MemoryManager::Print(void)
 | 
					void  MemoryManager::Print(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  PrintBytes();
 | 
					 | 
				
			||||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
					  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
				
			||||||
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
 | 
					  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
 | 
				
			||||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
					  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
				
			||||||
@@ -504,32 +473,6 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void MemoryManager::PrintState(void* _CpuPtr)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  if ( EntryPresent(CpuPtr) ){
 | 
					 | 
				
			||||||
    auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
					 | 
				
			||||||
    auto & AccCache = AccCacheIterator->second;
 | 
					 | 
				
			||||||
    std::string str;
 | 
					 | 
				
			||||||
    if ( AccCache.state==Empty    ) str = std::string("Empty");
 | 
					 | 
				
			||||||
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
 | 
					 | 
				
			||||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
					 | 
				
			||||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
					 | 
				
			||||||
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
					 | 
				
			||||||
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
					 | 
				
			||||||
    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
					 | 
				
			||||||
    << "\t" << AccCache.cpuLock
 | 
					 | 
				
			||||||
    << "\t" << AccCache.accLock
 | 
					 | 
				
			||||||
    << "\t" << AccCache.LRU_valid<<std::endl;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl; 
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
NAMESPACE_END(Grid);
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -12,18 +12,10 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
				
			|||||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
					uint64_t  MemoryManager::DeviceToHostBytes;
 | 
				
			||||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
					uint64_t  MemoryManager::HostToDeviceXfer;
 | 
				
			||||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
					uint64_t  MemoryManager::DeviceToHostXfer;
 | 
				
			||||||
uint64_t  MemoryManager::DeviceAccesses;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::HostAccesses;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::DeviceAccessBytes;
 | 
					 | 
				
			||||||
uint64_t  MemoryManager::HostAccessBytes;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 | 
					void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 | 
				
			||||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 | 
					void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 | 
				
			||||||
int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 | 
					int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 | 
				
			||||||
void  MemoryManager::PrintState(void* CpuPtr)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
void  MemoryManager::Print(void){};
 | 
					void  MemoryManager::Print(void){};
 | 
				
			||||||
void  MemoryManager::NotifyDeletion(void *ptr){};
 | 
					void  MemoryManager::NotifyDeletion(void *ptr){};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
NAMESPACE_BEGIN(Grid);
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
bool Stencil_force_mpi = true;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
///////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////
 | 
				
			||||||
// Info that is setup once and indept of cartesian layout
 | 
					// Info that is setup once and indept of cartesian layout
 | 
				
			||||||
///////////////////////////////////////////////////////////////
 | 
					///////////////////////////////////////////////////////////////
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,8 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
NAMESPACE_BEGIN(Grid);
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern bool Stencil_force_mpi ;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class CartesianCommunicator : public SharedMemory {
 | 
					class CartesianCommunicator : public SharedMemory {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:    
 | 
					public:    
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -370,7 +370,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 | 
				
			|||||||
  double off_node_bytes=0.0;
 | 
					  double off_node_bytes=0.0;
 | 
				
			||||||
  int tag;
 | 
					  int tag;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
 | 
					  if ( gfrom ==MPI_UNDEFINED) {
 | 
				
			||||||
    tag= dir+from*32;
 | 
					    tag= dir+from*32;
 | 
				
			||||||
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
 | 
					    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
 | 
				
			||||||
    assert(ierr==0);
 | 
					    assert(ierr==0);
 | 
				
			||||||
@@ -378,18 +378,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 | 
				
			|||||||
    off_node_bytes+=bytes;
 | 
					    off_node_bytes+=bytes;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
 | 
					  if ( gdest == MPI_UNDEFINED ) {
 | 
				
			||||||
    tag= dir+_processor*32;
 | 
					    tag= dir+_processor*32;
 | 
				
			||||||
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 | 
					    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 | 
				
			||||||
    assert(ierr==0);
 | 
					    assert(ierr==0);
 | 
				
			||||||
    list.push_back(xrq);
 | 
					    list.push_back(xrq);
 | 
				
			||||||
    off_node_bytes+=bytes;
 | 
					    off_node_bytes+=bytes;
 | 
				
			||||||
  } else {
 | 
					 | 
				
			||||||
    // TODO : make a OMP loop on CPU, call threaded bcopy
 | 
					 | 
				
			||||||
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
 | 
					 | 
				
			||||||
    assert(shm!=NULL);
 | 
					 | 
				
			||||||
    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
 | 
					 | 
				
			||||||
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
 | 
					  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
 | 
				
			||||||
@@ -400,9 +394,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 | 
					void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //   std::cout << "Copy Synchronised\n"<<std::endl;
 | 
					 | 
				
			||||||
  acceleratorCopySynchronise();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  int nreq=list.size();
 | 
					  int nreq=list.size();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (nreq==0) return;
 | 
					  if (nreq==0) return;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -513,16 +513,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  // Each MPI rank should allocate our own buffer
 | 
					  // Each MPI rank should allocate our own buffer
 | 
				
			||||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
				
			||||||
 | 
					  auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
 | 
				
			||||||
 | 
					  auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
 | 
				
			||||||
 | 
					  ze_device_mem_alloc_desc_t zeDesc = {};
 | 
				
			||||||
 | 
					  zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
 | 
				
			||||||
 | 
					  std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes 
 | 
				
			||||||
 | 
						      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
 | 
				
			||||||
 | 
					#else  
 | 
				
			||||||
  ShmCommBuf = acceleratorAllocDevice(bytes);
 | 
					  ShmCommBuf = acceleratorAllocDevice(bytes);
 | 
				
			||||||
 | 
					#endif  
 | 
				
			||||||
  if (ShmCommBuf == (void *)NULL ) {
 | 
					  if (ShmCommBuf == (void *)NULL ) {
 | 
				
			||||||
    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
 | 
					    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
 | 
				
			||||||
    exit(EXIT_FAILURE);  
 | 
					    exit(EXIT_FAILURE);  
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if ( WorldRank == 0 ){
 | 
					  //  if ( WorldRank == 0 ){
 | 
				
			||||||
 | 
					  if ( 1 ){
 | 
				
			||||||
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 | 
					    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 | 
				
			||||||
	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
 | 
						      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  SharedMemoryZero(ShmCommBuf,bytes);
 | 
					  //  SharedMemoryZero(ShmCommBuf,bytes);
 | 
				
			||||||
  std::cout<< "Setting up IPC"<<std::endl;
 | 
					  std::cout<< "Setting up IPC"<<std::endl;
 | 
				
			||||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  // Loop over ranks/gpu's on our node
 | 
					  // Loop over ranks/gpu's on our node
 | 
				
			||||||
@@ -533,27 +543,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
    //////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////
 | 
				
			||||||
    // If it is me, pass around the IPC access key
 | 
					    // If it is me, pass around the IPC access key
 | 
				
			||||||
    //////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////
 | 
				
			||||||
    void * thisBuf = ShmCommBuf;
 | 
					 | 
				
			||||||
    if(!Stencil_force_mpi) {
 | 
					 | 
				
			||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
					#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
				
			||||||
    typedef struct { int fd; pid_t pid ; } clone_mem_t;
 | 
					    ze_ipc_mem_handle_t handle;
 | 
				
			||||||
 | 
					 | 
				
			||||||
    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
 | 
					 | 
				
			||||||
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
    ze_ipc_mem_handle_t ihandle;
 | 
					 | 
				
			||||||
    clone_mem_t handle;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if ( r==WorldShmRank ) { 
 | 
					    if ( r==WorldShmRank ) { 
 | 
				
			||||||
      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
 | 
					      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
 | 
				
			||||||
      if ( err != ZE_RESULT_SUCCESS ) {
 | 
					      if ( err != ZE_RESULT_SUCCESS ) {
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 | 
						std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 | 
				
			||||||
	exit(EXIT_FAILURE);
 | 
						exit(EXIT_FAILURE);
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 | 
						std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
 | 
					      std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) ";
 | 
				
			||||||
      handle.pid = getpid();
 | 
					      for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
 | 
				
			||||||
 | 
						std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      std::cerr<<std::endl;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef GRID_CUDA
 | 
					#ifdef GRID_CUDA
 | 
				
			||||||
@@ -576,7 +580,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					 | 
				
			||||||
    //////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////
 | 
				
			||||||
    // Share this IPC handle across the Shm Comm
 | 
					    // Share this IPC handle across the Shm Comm
 | 
				
			||||||
    //////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////
 | 
				
			||||||
@@ -592,31 +595,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
    ///////////////////////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////////////////////
 | 
				
			||||||
    // If I am not the source, overwrite thisBuf with remote buffer
 | 
					    // If I am not the source, overwrite thisBuf with remote buffer
 | 
				
			||||||
    ///////////////////////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					    void * thisBuf = ShmCommBuf;
 | 
				
			||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
					#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
				
			||||||
    if ( r!=WorldShmRank ) {
 | 
					    if ( r!=WorldShmRank ) {
 | 
				
			||||||
      thisBuf = nullptr;
 | 
					      thisBuf = nullptr;
 | 
				
			||||||
      std::cout<<"mapping seeking remote pid/fd "
 | 
					      std::cerr<<"Using IpcHandle rank "<<r<<" ";
 | 
				
			||||||
	       <<handle.pid<<"/"
 | 
					      for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
 | 
				
			||||||
	       <<handle.fd<<std::endl;
 | 
						std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
 | 
					      std::cerr<<std::endl;
 | 
				
			||||||
      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
 | 
					      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf);
 | 
				
			||||||
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
 | 
					 | 
				
			||||||
      int myfd  = syscall(438,pidfd,handle.fd,0);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
 | 
					 | 
				
			||||||
      if ( err != ZE_RESULT_SUCCESS ) {
 | 
					      if ( err != ZE_RESULT_SUCCESS ) {
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 | 
						std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 | 
						std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 | 
				
			||||||
	exit(EXIT_FAILURE);
 | 
						exit(EXIT_FAILURE);
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
 | 
						std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 | 
				
			||||||
	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      assert(thisBuf!=nullptr);
 | 
					      assert(thisBuf!=nullptr);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -642,7 +636,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
    ///////////////////////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////////////////////
 | 
				
			||||||
    // Save a copy of the device buffers
 | 
					    // Save a copy of the device buffers
 | 
				
			||||||
    ///////////////////////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////////////////////
 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    WorldShmCommBufs[r] = thisBuf;
 | 
					    WorldShmCommBufs[r] = thisBuf;
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    WorldShmCommBufs[r] = ShmCommBuf;
 | 
					    WorldShmCommBufs[r] = ShmCommBuf;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -225,7 +225,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 | 
				
			|||||||
  autoView( x_v , x, AcceleratorRead);
 | 
					  autoView( x_v , x, AcceleratorRead);
 | 
				
			||||||
  autoView( y_v , y, AcceleratorRead);
 | 
					  autoView( y_v , y, AcceleratorRead);
 | 
				
			||||||
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
 | 
					  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
 | 
				
			||||||
    auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
 | 
					    auto tmp = a*x_v(ss)+y_v(ss);
 | 
				
			||||||
    coalescedWrite(ret_v[ss],tmp);
 | 
					    coalescedWrite(ret_v[ss],tmp);
 | 
				
			||||||
  });
 | 
					  });
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -88,13 +88,6 @@ public:
 | 
				
			|||||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
 | 
					    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
 | 
				
			||||||
    accessor.ViewClose();
 | 
					    accessor.ViewClose();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Helper function to print the state of this object in the AccCache
 | 
					 | 
				
			||||||
  void PrintCacheState(void)
 | 
					 | 
				
			||||||
  {
 | 
					 | 
				
			||||||
    MemoryManager::PrintState(this->_odata);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  /////////////////////////////////////////////////////////////////////////////////
 | 
					  /////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  // Return a view object that may be dereferenced in site loops.
 | 
					  // Return a view object that may be dereferenced in site loops.
 | 
				
			||||||
  // The view is trivially copy constructible and may be copied to an accelerator device
 | 
					  // The view is trivially copy constructible and may be copied to an accelerator device
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	for(int k=k0; k<k1; ++k){
 | 
						for(int k=k0; k<k1; ++k){
 | 
				
			||||||
	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
 | 
						  auto tmp = coalescedRead(Bp[ss*nrot+j]);
 | 
				
			||||||
	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
 | 
						  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      });
 | 
					      });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -134,7 +134,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
				
			|||||||
	int jj  =j0+j;
 | 
						int jj  =j0+j;
 | 
				
			||||||
	int ss =sj/nrot;
 | 
						int ss =sj/nrot;
 | 
				
			||||||
	int sss=ss+s;
 | 
						int sss=ss+s;
 | 
				
			||||||
	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
 | 
						coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
 | 
				
			||||||
      });
 | 
					      });
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -361,7 +361,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
				
			|||||||
  // But easily avoided by using double precision fields
 | 
					  // But easily avoided by using double precision fields
 | 
				
			||||||
  ///////////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////////
 | 
				
			||||||
  typedef typename vobj::scalar_object sobj;
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
  typedef typename vobj::scalar_object::scalar_type scalar_type;
 | 
					 | 
				
			||||||
  GridBase  *grid = Data.Grid();
 | 
					  GridBase  *grid = Data.Grid();
 | 
				
			||||||
  assert(grid!=NULL);
 | 
					  assert(grid!=NULL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -420,19 +419,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // sum over nodes.
 | 
					  // sum over nodes.
 | 
				
			||||||
 | 
					  sobj gsum;
 | 
				
			||||||
  for(int t=0;t<fd;t++){
 | 
					  for(int t=0;t<fd;t++){
 | 
				
			||||||
    int pt = t/ld; // processor plane
 | 
					    int pt = t/ld; // processor plane
 | 
				
			||||||
    int lt = t%ld;
 | 
					    int lt = t%ld;
 | 
				
			||||||
    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
					    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
				
			||||||
      result[t]=lsSum[lt];
 | 
					      gsum=lsSum[lt];
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
      result[t]=Zero();
 | 
					      gsum=Zero();
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    grid->GlobalSum(gsum);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    result[t]=gsum;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  scalar_type * ptr = (scalar_type *) &result[0];
 | 
					 | 
				
			||||||
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
 | 
					 | 
				
			||||||
  grid->GlobalSumVector(ptr, words);
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class vobj>
 | 
					template<class vobj>
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -42,6 +42,7 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
 | 
				
			|||||||
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
 | 
					  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
 | 
				
			||||||
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
 | 
					  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
 | 
				
			||||||
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
 | 
					  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
 | 
				
			||||||
 | 
					  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
 | 
				
			||||||
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
 | 
					  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if (warpSize != WARP_SIZE) {
 | 
					  if (warpSize != WARP_SIZE) {
 | 
				
			||||||
@@ -51,10 +52,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  // let the number of threads in a block be a multiple of 2, starting from warpSize
 | 
					  // let the number of threads in a block be a multiple of 2, starting from warpSize
 | 
				
			||||||
  threads = warpSize;
 | 
					  threads = warpSize;
 | 
				
			||||||
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
 | 
					 | 
				
			||||||
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
 | 
					 | 
				
			||||||
    exit(EXIT_FAILURE);
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
 | 
					  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
 | 
				
			||||||
  // keep all the streaming multiprocessors busy
 | 
					  // keep all the streaming multiprocessors busy
 | 
				
			||||||
  blocks = nextPow2(multiProcessorCount);
 | 
					  blocks = nextPow2(multiProcessorCount);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -85,76 +85,6 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
 | 
				
			|||||||
  });
 | 
					  });
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  half.Checkerboard() = cb;
 | 
					 | 
				
			||||||
  autoView(half_v, half, AcceleratorWrite);
 | 
					 | 
				
			||||||
  autoView(full_v, full, AcceleratorRead);
 | 
					 | 
				
			||||||
  Coordinate rdim_full             = full.Grid()->_rdimensions;
 | 
					 | 
				
			||||||
  Coordinate rdim_half             = half.Grid()->_rdimensions;
 | 
					 | 
				
			||||||
  unsigned long ndim_half          = half.Grid()->_ndimension;
 | 
					 | 
				
			||||||
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 | 
					 | 
				
			||||||
  Coordinate ostride_half          = half.Grid()->_ostride;
 | 
					 | 
				
			||||||
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    Coordinate coor;
 | 
					 | 
				
			||||||
    int cbos;
 | 
					 | 
				
			||||||
    int linear=0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
 | 
					 | 
				
			||||||
    assert(coor.size()==ndim_half);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int d=0;d<ndim_half;d++){ 
 | 
					 | 
				
			||||||
      if(checker_dim_mask_half[d]) linear += coor[d];
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    cbos = (linear&0x1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if (cbos==cb) {
 | 
					 | 
				
			||||||
      int ssh=0;
 | 
					 | 
				
			||||||
      for(int d=0;d<ndim_half;d++) {
 | 
					 | 
				
			||||||
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
 | 
					 | 
				
			||||||
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      coalescedWrite(half_v[ssh],full_v(ss));
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  });
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  int cb = half.Checkerboard();
 | 
					 | 
				
			||||||
  autoView(half_v , half, AcceleratorRead);
 | 
					 | 
				
			||||||
  autoView(full_v , full, AcceleratorWrite);
 | 
					 | 
				
			||||||
  Coordinate rdim_full             = full.Grid()->_rdimensions;
 | 
					 | 
				
			||||||
  Coordinate rdim_half             = half.Grid()->_rdimensions;
 | 
					 | 
				
			||||||
  unsigned long ndim_half          = half.Grid()->_ndimension;
 | 
					 | 
				
			||||||
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 | 
					 | 
				
			||||||
  Coordinate ostride_half          = half.Grid()->_ostride;
 | 
					 | 
				
			||||||
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Coordinate coor;
 | 
					 | 
				
			||||||
    int cbos;
 | 
					 | 
				
			||||||
    int linear=0;
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
 | 
					 | 
				
			||||||
    assert(coor.size()==ndim_half);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int d=0;d<ndim_half;d++){ 
 | 
					 | 
				
			||||||
      if(checker_dim_mask_half[d]) linear += coor[d];
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    cbos = (linear&0x1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if (cbos==cb) {
 | 
					 | 
				
			||||||
      int ssh=0;
 | 
					 | 
				
			||||||
      for(int d=0;d<ndim_half;d++){
 | 
					 | 
				
			||||||
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
 | 
					 | 
				
			||||||
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
      coalescedWrite(full_v[ss],half_v(ssh));
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  });
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Flexible Type Conversion for internal promotion to double as well as graceful
 | 
					// Flexible Type Conversion for internal promotion to double as well as graceful
 | 
				
			||||||
// treatment of scalar-compatible types
 | 
					// treatment of scalar-compatible types
 | 
				
			||||||
@@ -434,21 +364,15 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
				
			|||||||
  autoView( coarseData_ , coarseData, AcceleratorWrite);
 | 
					  autoView( coarseData_ , coarseData, AcceleratorWrite);
 | 
				
			||||||
  autoView( fineData_   , fineData, AcceleratorRead);
 | 
					  autoView( fineData_   , fineData, AcceleratorRead);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  auto coarseData_p = &coarseData_[0];
 | 
					 | 
				
			||||||
  auto fineData_p = &fineData_[0];
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  Coordinate fine_rdimensions = fine->_rdimensions;
 | 
					  Coordinate fine_rdimensions = fine->_rdimensions;
 | 
				
			||||||
  Coordinate coarse_rdimensions = coarse->_rdimensions;
 | 
					  Coordinate coarse_rdimensions = coarse->_rdimensions;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  vobj zz = Zero();
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  accelerator_for(sc,coarse->oSites(),1,{
 | 
					  accelerator_for(sc,coarse->oSites(),1,{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // One thread per sub block
 | 
					      // One thread per sub block
 | 
				
			||||||
      Coordinate coor_c(_ndimension);
 | 
					      Coordinate coor_c(_ndimension);
 | 
				
			||||||
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
 | 
					      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
 | 
				
			||||||
 | 
					      coarseData_[sc]=Zero();
 | 
				
			||||||
      vobj cd = zz;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int sb=0;sb<blockVol;sb++){
 | 
					      for(int sb=0;sb<blockVol;sb++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -459,11 +383,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
				
			|||||||
	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 | 
						for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 | 
				
			||||||
	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
 | 
						Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	cd=cd+fineData_p[sf];
 | 
						coarseData_[sc]=coarseData_[sc]+fineData_[sf];
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      coarseData_p[sc] = cd;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    });
 | 
					    });
 | 
				
			||||||
  return;
 | 
					  return;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -576,8 +576,6 @@ class ScidacReader : public GridLimeReader {
 | 
				
			|||||||
    std::string rec_name(ILDG_BINARY_DATA);
 | 
					    std::string rec_name(ILDG_BINARY_DATA);
 | 
				
			||||||
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
					    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
				
			||||||
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 | 
					      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 | 
				
			||||||
  // in principle should do the line below, but that breaks backard compatibility with old data
 | 
					 | 
				
			||||||
  // skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 | 
					 | 
				
			||||||
	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 | 
						skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 | 
				
			||||||
	return;
 | 
						return;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -198,7 +198,7 @@ public:
 | 
				
			|||||||
      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
 | 
					      std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
 | 
				
			||||||
      exit(0);
 | 
					      exit(0);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 | 
					    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-1 );
 | 
				
			||||||
    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
 | 
					    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
 | 
				
			||||||
    assert(nersc_csum == header.checksum );
 | 
					    assert(nersc_csum == header.checksum );
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -115,9 +115,9 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 | 
				
			|||||||
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
					typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
				
			||||||
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 | 
					typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
 | 
					typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
 | 
				
			||||||
//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
 | 
					typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
 | 
				
			||||||
//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
 | 
					typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 | 
					typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 | 
				
			||||||
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 | 
					typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 | 
				
			||||||
@@ -158,41 +158,41 @@ typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 | 
				
			|||||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 | 
					typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 | 
				
			||||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
					typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
 | 
					typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
 | 
				
			||||||
//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
 | 
					typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
 | 
				
			||||||
//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 | 
					typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 | 
					typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 | 
				
			||||||
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 | 
					typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 | 
				
			||||||
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
 | 
					typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
 | 
					typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
 | 
				
			||||||
//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
 | 
					typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
 | 
				
			||||||
//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 | 
					typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
					typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
				
			||||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
					typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
				
			||||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
					typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
 | 
					typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
 | 
				
			||||||
//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
 | 
					typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
 | 
				
			||||||
//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 | 
					typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 | 
					typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 | 
				
			||||||
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 | 
					typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 | 
				
			||||||
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
 | 
					typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
 | 
					typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
 | 
				
			||||||
//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
 | 
					typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
 | 
				
			||||||
//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 | 
					typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 | 
					typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 | 
				
			||||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
					typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
				
			||||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
					typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
 | 
					typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
 | 
				
			||||||
//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 | 
					typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 | 
				
			||||||
//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 | 
					typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Ls vectorised
 | 
					// Ls vectorised
 | 
				
			||||||
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 | 
					typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 | 
				
			||||||
@@ -235,49 +235,49 @@ typedef WilsonFermion<GparityWilsonImplR>     GparityWilsonFermionR;
 | 
				
			|||||||
typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 | 
					typedef WilsonFermion<GparityWilsonImplF>     GparityWilsonFermionF;
 | 
				
			||||||
typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 | 
					typedef WilsonFermion<GparityWilsonImplD>     GparityWilsonFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
 | 
					typedef WilsonFermion<GparityWilsonImplRL>     GparityWilsonFermionRL;
 | 
				
			||||||
//typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
 | 
					typedef WilsonFermion<GparityWilsonImplFH>     GparityWilsonFermionFH;
 | 
				
			||||||
//typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
 | 
					typedef WilsonFermion<GparityWilsonImplDF>     GparityWilsonFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
					typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
					typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 | 
				
			||||||
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
					typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
 | 
					typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
 | 
				
			||||||
//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
 | 
					typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
 | 
				
			||||||
//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 | 
					typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 | 
				
			||||||
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 | 
				
			||||||
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
 | 
				
			||||||
//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
 | 
				
			||||||
//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 | 
					typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
					typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
				
			||||||
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
					typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
				
			||||||
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
					typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
 | 
					typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
 | 
				
			||||||
//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
 | 
					typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
 | 
				
			||||||
//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
 | 
					typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 | 
					typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 | 
				
			||||||
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 | 
					typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 | 
				
			||||||
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 | 
					typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
 | 
					typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
 | 
				
			||||||
//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 | 
					typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 | 
				
			||||||
//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 | 
					typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 | 
				
			||||||
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 | 
				
			||||||
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 | 
				
			||||||
//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 | 
				
			||||||
//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 | 
					typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 | 
					typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 | 
				
			||||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
					typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -327,8 +327,8 @@ typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> Gparit
 | 
				
			|||||||
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
 | 
					typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
 | 
				
			||||||
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
 | 
					typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
//typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
 | 
					typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
 | 
				
			||||||
//typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
 | 
					typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
 | 
				
			||||||
//typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 | 
					typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
NAMESPACE_END(Grid);
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -68,12 +68,11 @@ public:
 | 
				
			|||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  /* Compress includes precision change if mpi data is not same */
 | 
					  /* Compress includes precision change if mpi data is not same */
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 | 
					  template<class _SiteHalfSpinor, class _SiteSpinor>
 | 
				
			||||||
    typedef decltype(coalescedRead(buf)) sobj;
 | 
					  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
 | 
				
			||||||
    sobj sp;
 | 
					    _SiteHalfSpinor tmp;
 | 
				
			||||||
    auto sin = coalescedRead(in);
 | 
					    projector::Proj(tmp,in,mu,dag);
 | 
				
			||||||
    projector::Proj(sp,sin,mu,dag);
 | 
					    vstream(buf[o],tmp);
 | 
				
			||||||
    coalescedWrite(buf,sp);
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
@@ -83,18 +82,13 @@ public:
 | 
				
			|||||||
				   const SiteHalfSpinor * __restrict__ vp0,
 | 
									   const SiteHalfSpinor * __restrict__ vp0,
 | 
				
			||||||
				   const SiteHalfSpinor * __restrict__ vp1,
 | 
									   const SiteHalfSpinor * __restrict__ vp1,
 | 
				
			||||||
				   Integer type,Integer o) const {
 | 
									   Integer type,Integer o) const {
 | 
				
			||||||
#ifdef GRID_SIMT
 | 
					 | 
				
			||||||
    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
    SiteHalfSpinor tmp1;
 | 
					    SiteHalfSpinor tmp1;
 | 
				
			||||||
    SiteHalfSpinor tmp2;
 | 
					    SiteHalfSpinor tmp2;
 | 
				
			||||||
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
 | 
					    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
 | 
				
			||||||
    vstream(mp[2*o  ],tmp1);
 | 
					    vstream(mp[2*o  ],tmp1);
 | 
				
			||||||
    vstream(mp[2*o+1],tmp2);
 | 
					    vstream(mp[2*o+1],tmp2);
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  /* Have a decompression step if mpi data is not same */
 | 
					  /* Have a decompression step if mpi data is not same */
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
@@ -111,28 +105,6 @@ public:
 | 
				
			|||||||
					   const SiteSpinor * __restrict__ in,
 | 
										   const SiteSpinor * __restrict__ in,
 | 
				
			||||||
					   Integer j,Integer k, Integer m,Integer type) const
 | 
										   Integer j,Integer k, Integer m,Integer type) const
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
#ifdef GRID_SIMT
 | 
					 | 
				
			||||||
    typedef SiteSpinor vobj;
 | 
					 | 
				
			||||||
    typedef SiteHalfSpinor hvobj;
 | 
					 | 
				
			||||||
    typedef decltype(coalescedRead(*in))    sobj;
 | 
					 | 
				
			||||||
    typedef decltype(coalescedRead(*out0)) hsobj;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    unsigned int Nsimd = vobj::Nsimd();
 | 
					 | 
				
			||||||
    unsigned int mask = Nsimd >> (type + 1);
 | 
					 | 
				
			||||||
    int lane = acceleratorSIMTlane(Nsimd);
 | 
					 | 
				
			||||||
    int j0 = lane &(~mask); // inner coor zero
 | 
					 | 
				
			||||||
    int j1 = lane |(mask) ; // inner coor one
 | 
					 | 
				
			||||||
    const vobj *vp0 = &in[k];
 | 
					 | 
				
			||||||
    const vobj *vp1 = &in[m];
 | 
					 | 
				
			||||||
    const vobj *vp = (lane&mask) ? vp1:vp0;
 | 
					 | 
				
			||||||
    auto sa = coalescedRead(*vp,j0);
 | 
					 | 
				
			||||||
    auto sb = coalescedRead(*vp,j1);
 | 
					 | 
				
			||||||
    hsobj psa, psb;
 | 
					 | 
				
			||||||
    projector::Proj(psa,sa,mu,dag);
 | 
					 | 
				
			||||||
    projector::Proj(psb,sb,mu,dag);
 | 
					 | 
				
			||||||
    coalescedWrite(out0[j],psa);
 | 
					 | 
				
			||||||
    coalescedWrite(out1[j],psb);
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
    SiteHalfSpinor temp1, temp2;
 | 
					    SiteHalfSpinor temp1, temp2;
 | 
				
			||||||
    SiteHalfSpinor temp3, temp4;
 | 
					    SiteHalfSpinor temp3, temp4;
 | 
				
			||||||
    projector::Proj(temp1,in[k],mu,dag);
 | 
					    projector::Proj(temp1,in[k],mu,dag);
 | 
				
			||||||
@@ -140,7 +112,6 @@ public:
 | 
				
			|||||||
    exchange(temp3,temp4,temp1,temp2,type);
 | 
					    exchange(temp3,temp4,temp1,temp2,type);
 | 
				
			||||||
    vstream(out0[j],temp3);
 | 
					    vstream(out0[j],temp3);
 | 
				
			||||||
    vstream(out1[j],temp4);
 | 
					    vstream(out1[j],temp4);
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
@@ -150,7 +121,6 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if 0
 | 
					 | 
				
			||||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 | 
					template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
 | 
				
			||||||
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 | 
					class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 | 
				
			||||||
				typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 | 
									typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
 | 
				
			||||||
@@ -179,23 +149,13 @@ public:
 | 
				
			|||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  /* Compress includes precision change if mpi data is not same */
 | 
					  /* Compress includes precision change if mpi data is not same */
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 | 
					  template<class _SiteHalfSpinor, class _SiteSpinor>
 | 
				
			||||||
    SiteHalfSpinor hsp;
 | 
					  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
 | 
				
			||||||
 | 
					    _SiteHalfSpinor hsp;
 | 
				
			||||||
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
 | 
					    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
 | 
				
			||||||
    projector::Proj(hsp,in,mu,dag);
 | 
					    projector::Proj(hsp,in,mu,dag);
 | 
				
			||||||
    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
 | 
					    precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
 | 
					 | 
				
			||||||
#ifdef GRID_SIMT
 | 
					 | 
				
			||||||
    typedef decltype(coalescedRead(buf)) sobj;
 | 
					 | 
				
			||||||
    sobj sp;
 | 
					 | 
				
			||||||
    auto sin = coalescedRead(in);
 | 
					 | 
				
			||||||
    projector::Proj(sp,sin,mu,dag);
 | 
					 | 
				
			||||||
    coalescedWrite(buf,sp);
 | 
					 | 
				
			||||||
#else
 | 
					 | 
				
			||||||
    projector::Proj(buf,in,mu,dag);
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*****************************************************/
 | 
					  /*****************************************************/
 | 
				
			||||||
  /* Exchange includes precision change if mpi data is not same */
 | 
					  /* Exchange includes precision change if mpi data is not same */
 | 
				
			||||||
@@ -243,7 +203,6 @@ public:
 | 
				
			|||||||
  accelerator_inline bool DecompressionStep(void) const { return true; }
 | 
					  accelerator_inline bool DecompressionStep(void) const { return true; }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DECLARE_PROJ(Projector,Compressor,spProj)			\
 | 
					#define DECLARE_PROJ(Projector,Compressor,spProj)			\
 | 
				
			||||||
  class Projector {							\
 | 
					  class Projector {							\
 | 
				
			||||||
@@ -294,8 +253,33 @@ public:
 | 
				
			|||||||
  typedef typename Base::View_type View_type;
 | 
					  typedef typename Base::View_type View_type;
 | 
				
			||||||
  typedef typename Base::StencilVector StencilVector;
 | 
					  typedef typename Base::StencilVector StencilVector;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  void ZeroCountersi(void)  {  }
 | 
					  double timer0;
 | 
				
			||||||
  void Reporti(int calls)  {  }
 | 
					  double timer1;
 | 
				
			||||||
 | 
					  double timer2;
 | 
				
			||||||
 | 
					  double timer3;
 | 
				
			||||||
 | 
					  double timer4;
 | 
				
			||||||
 | 
					  double timer5;
 | 
				
			||||||
 | 
					  double timer6;
 | 
				
			||||||
 | 
					  uint64_t callsi;
 | 
				
			||||||
 | 
					  void ZeroCountersi(void)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    timer0=0;
 | 
				
			||||||
 | 
					    timer1=0;
 | 
				
			||||||
 | 
					    timer2=0;
 | 
				
			||||||
 | 
					    timer3=0;
 | 
				
			||||||
 | 
					    timer4=0;
 | 
				
			||||||
 | 
					    timer5=0;
 | 
				
			||||||
 | 
					    timer6=0;
 | 
				
			||||||
 | 
					    callsi=0;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void Reporti(int calls)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
 | 
				
			||||||
 | 
					    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
 | 
				
			||||||
 | 
					    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
 | 
				
			||||||
 | 
					    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
 | 
				
			||||||
 | 
					    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<int> surface_list;
 | 
					  std::vector<int> surface_list;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -337,18 +321,26 @@ public:
 | 
				
			|||||||
  {
 | 
					  {
 | 
				
			||||||
    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
					    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
				
			||||||
    this->HaloExchangeOptGather(source,compress);
 | 
					    this->HaloExchangeOptGather(source,compress);
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
    // Asynchronous MPI calls multidirectional, Isend etc...
 | 
					    // Asynchronous MPI calls multidirectional, Isend etc...
 | 
				
			||||||
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
 | 
					    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
 | 
				
			||||||
    this->Communicate();
 | 
					    this->Communicate();
 | 
				
			||||||
 | 
					    double t2=usecond(); timer1 += t2-t1;
 | 
				
			||||||
    this->CommsMerge(compress);
 | 
					    this->CommsMerge(compress);
 | 
				
			||||||
 | 
					    double t3=usecond(); timer2 += t3-t2;
 | 
				
			||||||
    this->CommsMergeSHM(compress);
 | 
					    this->CommsMergeSHM(compress);
 | 
				
			||||||
 | 
					    double t4=usecond(); timer3 += t4-t3;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  template <class compressor>
 | 
					  template <class compressor>
 | 
				
			||||||
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
					  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    this->Prepare();
 | 
					    this->Prepare();
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
    this->HaloGatherOpt(source,compress);
 | 
					    this->HaloGatherOpt(source,compress);
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    timer0 += t1-t0;
 | 
				
			||||||
 | 
					    callsi++;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template <class compressor>
 | 
					  template <class compressor>
 | 
				
			||||||
@@ -360,9 +352,12 @@ public:
 | 
				
			|||||||
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
 | 
					    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
 | 
				
			||||||
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 | 
					    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    this->mpi3synctime_g-=usecond();
 | 
				
			||||||
    this->_grid->StencilBarrier();
 | 
					    this->_grid->StencilBarrier();
 | 
				
			||||||
 | 
					    this->mpi3synctime_g+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert(source.Grid()==this->_grid);
 | 
					    assert(source.Grid()==this->_grid);
 | 
				
			||||||
 | 
					    this->halogtime-=usecond();
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    this->u_comm_offset=0;
 | 
					    this->u_comm_offset=0;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -398,6 +393,7 @@ public:
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    this->face_table_computed=1;
 | 
					    this->face_table_computed=1;
 | 
				
			||||||
    assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
					    assert(this->u_comm_offset==this->_unified_buffer_size);
 | 
				
			||||||
 | 
					    this->halogtime+=usecond();
 | 
				
			||||||
    accelerator_barrier();
 | 
					    accelerator_barrier();
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -243,17 +243,17 @@ typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > WilsonImplR
 | 
				
			|||||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 | 
					typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
 | 
				
			||||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
 | 
					typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
 | 
					typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL;  // Real.. whichever prec
 | 
				
			||||||
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
 | 
					typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH;  // Float
 | 
				
			||||||
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
 | 
					typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF;  // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 | 
					typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 | 
				
			||||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 | 
					typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 | 
				
			||||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
 | 
					typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
 | 
					typedef WilsonImpl<vComplex,  FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
 | 
				
			||||||
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
 | 
					typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
 | 
				
			||||||
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 | 
					typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 | 
					typedef WilsonImpl<vComplex,  AdjointRepresentation, CoeffReal > WilsonAdjImplR;   // Real.. whichever prec
 | 
				
			||||||
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
 | 
					typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -828,7 +828,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#if (!defined(GRID_HIP))
 | 
					#if (!defined(GRID_HIP))
 | 
				
			||||||
  int tshift = (mu == Nd-1) ? 1 : 0;
 | 
					  int tshift = (mu == Nd-1) ? 1 : 0;
 | 
				
			||||||
  unsigned int LLt    = GridDefaultLatt()[Tp];
 | 
					 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
  // GENERAL CAYLEY CASE
 | 
					  // GENERAL CAYLEY CASE
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
@@ -881,7 +880,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<RealD> G_s(Ls,1.0);
 | 
					  std::vector<RealD> G_s(Ls,1.0);
 | 
				
			||||||
  RealD sign = 1.0; // sign flip for vector/tadpole
 | 
					  Integer sign = 1; // sign flip for vector/tadpole
 | 
				
			||||||
  if ( curr_type == Current::Axial ) {
 | 
					  if ( curr_type == Current::Axial ) {
 | 
				
			||||||
    for(int s=0;s<Ls/2;s++){
 | 
					    for(int s=0;s<Ls/2;s++){
 | 
				
			||||||
      G_s[s] = -1.0;
 | 
					      G_s[s] = -1.0;
 | 
				
			||||||
@@ -891,7 +890,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
				
			|||||||
    auto b=this->_b;
 | 
					    auto b=this->_b;
 | 
				
			||||||
    auto c=this->_c;
 | 
					    auto c=this->_c;
 | 
				
			||||||
    if ( b == 1 && c == 0 ) {
 | 
					    if ( b == 1 && c == 0 ) {
 | 
				
			||||||
      sign = -1.0;    
 | 
					      sign = -1;    
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    else {
 | 
					    else {
 | 
				
			||||||
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
 | 
					      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
 | 
				
			||||||
@@ -902,8 +901,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
				
			|||||||
  for(int s=0;s<Ls;s++){
 | 
					  for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int sp = (s+1)%Ls;
 | 
					    int sp = (s+1)%Ls;
 | 
				
			||||||
    //    int sr = Ls-1-s;
 | 
					    int sr = Ls-1-s;
 | 
				
			||||||
    //    int srp= (sr+1)%Ls;
 | 
					    int srp= (sr+1)%Ls;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Mobius parameters
 | 
					    // Mobius parameters
 | 
				
			||||||
    auto b=this->bs[s];
 | 
					    auto b=this->bs[s];
 | 
				
			||||||
@@ -935,13 +934,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
				
			|||||||
    tmp    = Cshift(tmp,mu,-1);
 | 
					    tmp    = Cshift(tmp,mu,-1);
 | 
				
			||||||
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
 | 
					    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
 | 
				
			||||||
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
 | 
					    tmp = -G_s[s]*( Utmp + gmu*Utmp );
 | 
				
			||||||
    // Mask the time
 | 
					    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
 | 
				
			||||||
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
 | 
					 | 
				
			||||||
      unsigned int t0 = 0;
 | 
					 | 
				
			||||||
      tmp    = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
      tmp    = where((lcoor>=tmin+tshift),tmp,zz);
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 | 
					    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    InsertSlice(L_Q, q_out, s , 0);
 | 
					    InsertSlice(L_Q, q_out, s , 0);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -73,17 +73,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -102,17 +102,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -131,17 +131,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -165,17 +165,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -194,17 +194,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -223,17 +223,17 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
//template<> void
 | 
					template<> void
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -280,17 +280,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -309,17 +309,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -338,17 +338,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -371,17 +371,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -400,17 +400,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -429,17 +429,17 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
					#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
 | 
				
			||||||
// template<> void
 | 
					template<> void
 | 
				
			||||||
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
// 						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -74,15 +74,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -97,15 +97,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
@@ -121,15 +121,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
/////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////
 | 
				
			||||||
// XYZT vectorised, dag Kernel, single
 | 
					// XYZT vectorised, dag Kernel, single
 | 
				
			||||||
@@ -148,15 +148,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -171,15 +171,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -194,15 +194,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
#undef MAYBEPERM
 | 
					#undef MAYBEPERM
 | 
				
			||||||
#undef MULT_2SPIN
 | 
					#undef MULT_2SPIN
 | 
				
			||||||
@@ -228,14 +228,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeF
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -249,14 +249,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGau
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -273,15 +273,15 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGau
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
/////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Ls vectorised, dag Kernel, single
 | 
					// Ls vectorised, dag Kernel, single
 | 
				
			||||||
@@ -299,14 +299,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGau
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -320,14 +320,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, Doubled
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -341,14 +341,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif  // VEC 5D
 | 
					#endif  // VEC 5D
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -392,14 +392,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -413,14 +413,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -434,14 +434,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
/////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////
 | 
				
			||||||
// XYZT vectorised, dag Kernel, single
 | 
					// XYZT vectorised, dag Kernel, single
 | 
				
			||||||
@@ -459,14 +459,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -480,14 +480,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -501,14 +501,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
 | 
				
			|||||||
						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//						int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
											int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
#undef MAYBEPERM
 | 
					#undef MAYBEPERM
 | 
				
			||||||
#undef MULT_2SPIN
 | 
					#undef MULT_2SPIN
 | 
				
			||||||
@@ -533,14 +533,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeF
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -554,14 +554,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGau
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -577,14 +577,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGau
 | 
				
			|||||||
							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
 | 
				
			||||||
//							 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												 int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
				    
 | 
									    
 | 
				
			||||||
/////////////////////////////////////////////////////////////////
 | 
					/////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Ls vectorised, dag Kernel, single
 | 
					// Ls vectorised, dag Kernel, single
 | 
				
			||||||
@@ -602,14 +602,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGau
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#define INTERIOR
 | 
					#define INTERIOR
 | 
				
			||||||
@@ -623,14 +623,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, Doubled
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef INTERIOR_AND_EXTERIOR
 | 
					#undef INTERIOR_AND_EXTERIOR
 | 
				
			||||||
#undef INTERIOR
 | 
					#undef INTERIOR
 | 
				
			||||||
@@ -645,14 +645,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, Doubled
 | 
				
			|||||||
							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
//template<> void
 | 
					template<> void 
 | 
				
			||||||
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
//							    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
												    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif  // VEC 5D
 | 
					#endif  // VEC 5D
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define REGISTER
 | 
					#define REGISTER
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef GRID_SIMT
 | 
					#ifdef GRID_SIMT
 | 
				
			||||||
#define LOAD_CHIMU(Ptype)		\
 | 
					#define LOAD_CHIMU(ptype)		\
 | 
				
			||||||
  {const SiteSpinor & ref (in[offset]);	\
 | 
					  {const SiteSpinor & ref (in[offset]);	\
 | 
				
			||||||
    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
 | 
					    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
 | 
				
			||||||
    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
 | 
					    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
 | 
				
			||||||
    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
 | 
					    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
 | 
				
			||||||
    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
 | 
					    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
 | 
				
			||||||
    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
 | 
					    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
 | 
				
			||||||
    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
 | 
					    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
 | 
				
			||||||
    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
 | 
					    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
 | 
				
			||||||
    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
 | 
					    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
 | 
				
			||||||
    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
 | 
					    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
 | 
				
			||||||
    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
 | 
					    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
 | 
				
			||||||
    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
 | 
					    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
 | 
				
			||||||
    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 | 
					    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
 | 
				
			||||||
#define PERMUTE_DIR(dir) ;
 | 
					#define PERMUTE_DIR(dir) ;
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
#define LOAD_CHIMU(Ptype)		\
 | 
					#define LOAD_CHIMU(ptype)		\
 | 
				
			||||||
  {const SiteSpinor & ref (in[offset]);	\
 | 
					  {const SiteSpinor & ref (in[offset]);	\
 | 
				
			||||||
    Chimu_00=ref()(0)(0);\
 | 
					    Chimu_00=ref()(0)(0);\
 | 
				
			||||||
    Chimu_01=ref()(0)(1);\
 | 
					    Chimu_01=ref()(0)(1);\
 | 
				
			||||||
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
    Chimu_32=ref()(3)(2);}
 | 
					    Chimu_32=ref()(3)(2);}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define PERMUTE_DIR(dir)			\
 | 
					#define PERMUTE_DIR(dir)			\
 | 
				
			||||||
  permute##dir(Chi_00,Chi_00);			\
 | 
					  permute##dir(Chi_00,Chi_00);	\
 | 
				
			||||||
  permute##dir(Chi_01,Chi_01);			\
 | 
					      permute##dir(Chi_01,Chi_01);\
 | 
				
			||||||
  permute##dir(Chi_02,Chi_02);			\
 | 
					      permute##dir(Chi_02,Chi_02);\
 | 
				
			||||||
  permute##dir(Chi_10,Chi_10);			\
 | 
					      permute##dir(Chi_10,Chi_10);	\
 | 
				
			||||||
  permute##dir(Chi_11,Chi_11);			\
 | 
					      permute##dir(Chi_11,Chi_11);\
 | 
				
			||||||
  permute##dir(Chi_12,Chi_12);
 | 
					      permute##dir(Chi_12,Chi_12);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -371,91 +371,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  result_32-= UChi_12;
 | 
					  result_32-= UChi_12;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
 | 
					#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
 | 
				
			||||||
  {int ptype;					\
 | 
					  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
				
			||||||
   SE=st.GetEntry(ptype,DIR,ss);		\
 | 
					  offset = SE->_offset;				\
 | 
				
			||||||
   auto offset = SE->_offset;			\
 | 
					  local  = SE->_is_local;			\
 | 
				
			||||||
   auto local  = SE->_is_local;			\
 | 
					  perm   = SE->_permute;			\
 | 
				
			||||||
   auto perm   = SE->_permute;			\
 | 
					  if ( local ) {				\
 | 
				
			||||||
   if ( local ) {				\
 | 
					    LOAD_CHIMU(PERM);				\
 | 
				
			||||||
     LOAD_CHIMU(PERM);				\
 | 
					    PROJ;					\
 | 
				
			||||||
     PROJ;					\
 | 
					    if ( perm) {				\
 | 
				
			||||||
     if ( perm) {				\
 | 
					      PERMUTE_DIR(PERM);			\
 | 
				
			||||||
       PERMUTE_DIR(PERM);			\
 | 
					    }						\
 | 
				
			||||||
     }						\
 | 
					  } else {					\
 | 
				
			||||||
   } else {					\
 | 
					    LOAD_CHI;					\
 | 
				
			||||||
     LOAD_CHI;					\
 | 
					  }						\
 | 
				
			||||||
   }						\
 | 
					  acceleratorSynchronise();			\
 | 
				
			||||||
   acceleratorSynchronise();			\
 | 
					  MULT_2SPIN(DIR);				\
 | 
				
			||||||
   MULT_2SPIN(DIR);				\
 | 
					  RECON;					
 | 
				
			||||||
   RECON;					}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
 | 
					#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
 | 
				
			||||||
  { SE=&st_p[DIR+8*ss];						\
 | 
					  SE=&st_p[DIR+8*ss];				\
 | 
				
			||||||
  auto ptype=st_perm[DIR];					\
 | 
					  ptype=st_perm[DIR];				\
 | 
				
			||||||
  auto offset = SE->_offset;					\
 | 
					  offset = SE->_offset;				\
 | 
				
			||||||
  auto local  = SE->_is_local;					\
 | 
					  local  = SE->_is_local;			\
 | 
				
			||||||
  auto perm   = SE->_permute;					\
 | 
					  perm   = SE->_permute;			\
 | 
				
			||||||
  if ( local ) {						\
 | 
					  if ( local ) {				\
 | 
				
			||||||
    LOAD_CHIMU(PERM);						\
 | 
					    LOAD_CHIMU(PERM);				\
 | 
				
			||||||
    PROJ;							\
 | 
					    PROJ;					\
 | 
				
			||||||
    if ( perm) {						\
 | 
					    if ( perm) {				\
 | 
				
			||||||
      PERMUTE_DIR(PERM);					\
 | 
					      PERMUTE_DIR(PERM);			\
 | 
				
			||||||
    }								\
 | 
					    }						\
 | 
				
			||||||
  } else {							\
 | 
					  } else {					\
 | 
				
			||||||
    LOAD_CHI;							\
 | 
					    LOAD_CHI;					\
 | 
				
			||||||
  }								\
 | 
					  }						\
 | 
				
			||||||
  acceleratorSynchronise();					\
 | 
					  acceleratorSynchronise();			\
 | 
				
			||||||
  MULT_2SPIN(DIR);						\
 | 
					  MULT_2SPIN(DIR);				\
 | 
				
			||||||
  RECON;					}
 | 
					  RECON;					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
 | 
					#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
 | 
				
			||||||
  { SE=&st_p[DIR+8*ss];							\
 | 
					  SE=&st_p[DIR+8*ss];							\
 | 
				
			||||||
    auto ptype=st_perm[DIR];						\
 | 
					  ptype=st_perm[DIR];							\
 | 
				
			||||||
    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
 | 
					 /*SE=st.GetEntry(ptype,DIR,ss);*/					\
 | 
				
			||||||
    auto offset = SE->_offset;						\
 | 
					  offset = SE->_offset;				\
 | 
				
			||||||
    auto perm   = SE->_permute;						\
 | 
					  perm   = SE->_permute;			\
 | 
				
			||||||
    LOAD_CHIMU(PERM);							\
 | 
					  LOAD_CHIMU(PERM);				\
 | 
				
			||||||
    PROJ;								\
 | 
					  PROJ;						\
 | 
				
			||||||
    MULT_2SPIN(DIR);							\
 | 
					  MULT_2SPIN(DIR);				\
 | 
				
			||||||
    RECON;					}
 | 
					  RECON;					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
 | 
					#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
 | 
				
			||||||
  { int ptype;						\
 | 
					  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
				
			||||||
  SE=st.GetEntry(ptype,DIR,ss);				\
 | 
					  offset = SE->_offset;				\
 | 
				
			||||||
  auto offset = SE->_offset;					\
 | 
					  local  = SE->_is_local;			\
 | 
				
			||||||
  auto local  = SE->_is_local;					\
 | 
					  perm   = SE->_permute;			\
 | 
				
			||||||
  auto perm   = SE->_permute;					\
 | 
					  if ( local ) {				\
 | 
				
			||||||
  if ( local ) {						\
 | 
					    LOAD_CHIMU(PERM);				\
 | 
				
			||||||
    LOAD_CHIMU(PERM);						\
 | 
					    PROJ;					\
 | 
				
			||||||
    PROJ;							\
 | 
					    if ( perm) {				\
 | 
				
			||||||
    if ( perm) {						\
 | 
					      PERMUTE_DIR(PERM);			\
 | 
				
			||||||
      PERMUTE_DIR(PERM);					\
 | 
					    }						\
 | 
				
			||||||
    }								\
 | 
					  } else if ( st.same_node[DIR] ) {		\
 | 
				
			||||||
  } else if ( st.same_node[DIR] ) {				\
 | 
					    LOAD_CHI;					\
 | 
				
			||||||
    LOAD_CHI;							\
 | 
					  }						\
 | 
				
			||||||
  }								\
 | 
					  acceleratorSynchronise();			\
 | 
				
			||||||
  acceleratorSynchronise();					\
 | 
					  if (local || st.same_node[DIR] ) {		\
 | 
				
			||||||
  if (local || st.same_node[DIR] ) {				\
 | 
					    MULT_2SPIN(DIR);				\
 | 
				
			||||||
    MULT_2SPIN(DIR);						\
 | 
					    RECON;					\
 | 
				
			||||||
    RECON;							\
 | 
					  }						\
 | 
				
			||||||
  }								\
 | 
					  acceleratorSynchronise();			
 | 
				
			||||||
  acceleratorSynchronise();			}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
 | 
					#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
 | 
				
			||||||
  { int ptype;						\
 | 
					  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
				
			||||||
  SE=st.GetEntry(ptype,DIR,ss);				\
 | 
					  offset = SE->_offset;				\
 | 
				
			||||||
  auto offset = SE->_offset;				\
 | 
					  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
 | 
				
			||||||
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
 | 
					    LOAD_CHI;					\
 | 
				
			||||||
    LOAD_CHI;						\
 | 
					    MULT_2SPIN(DIR);				\
 | 
				
			||||||
    MULT_2SPIN(DIR);					\
 | 
					    RECON;					\
 | 
				
			||||||
    RECON;						\
 | 
					    nmu++;					\
 | 
				
			||||||
    nmu++;						\
 | 
					  }						\
 | 
				
			||||||
  }							\
 | 
					  acceleratorSynchronise();			
 | 
				
			||||||
  acceleratorSynchronise();			}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define HAND_RESULT(ss)					\
 | 
					#define HAND_RESULT(ss)				\
 | 
				
			||||||
  {							\
 | 
					  {						\
 | 
				
			||||||
    SiteSpinor & ref (out[ss]);				\
 | 
					    SiteSpinor & ref (out[ss]);			\
 | 
				
			||||||
    coalescedWrite(ref()(0)(0),result_00,lane);		\
 | 
					    coalescedWrite(ref()(0)(0),result_00,lane);		\
 | 
				
			||||||
    coalescedWrite(ref()(0)(1),result_01,lane);		\
 | 
					    coalescedWrite(ref()(0)(1),result_01,lane);		\
 | 
				
			||||||
    coalescedWrite(ref()(0)(2),result_02,lane);		\
 | 
					    coalescedWrite(ref()(0)(2),result_02,lane);		\
 | 
				
			||||||
@@ -566,6 +563,7 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
					  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
				
			||||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
				
			||||||
@@ -595,7 +593,9 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
					  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
				
			||||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
				
			||||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
				
			||||||
@@ -623,6 +623,8 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
 | 
				
			|||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
 | 
					  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
 | 
				
			||||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
				
			||||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
				
			||||||
@@ -638,8 +640,8 @@ template<class Impl>  accelerator_inline void
 | 
				
			|||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 | 
					WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 | 
				
			||||||
					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
										  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  auto st_p = st._entries_p;						
 | 
					  auto st_p = st._entries_p;						
 | 
				
			||||||
  //  auto st_perm = st._permute_type;					
 | 
					  auto st_perm = st._permute_type;					
 | 
				
			||||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
@@ -650,6 +652,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  ZERO_RESULT;
 | 
					  ZERO_RESULT;
 | 
				
			||||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
				
			||||||
@@ -667,8 +670,8 @@ template<class Impl> accelerator_inline
 | 
				
			|||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
											  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  auto st_p = st._entries_p;						
 | 
					  auto st_p = st._entries_p;						
 | 
				
			||||||
  //  auto st_perm = st._permute_type;					
 | 
					  auto st_perm = st._permute_type;					
 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
 | 
					  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
 | 
				
			||||||
@@ -679,6 +682,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
 | 
				
			|||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
  ZERO_RESULT;
 | 
					  ZERO_RESULT;
 | 
				
			||||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
				
			||||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
				
			||||||
@@ -695,8 +699,8 @@ template<class Impl>  accelerator_inline void
 | 
				
			|||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 | 
					WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 | 
				
			||||||
					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
										  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  auto st_p = st._entries_p;						
 | 
					  auto st_p = st._entries_p;						
 | 
				
			||||||
  //  auto st_perm = st._permute_type;					
 | 
					  auto st_perm = st._permute_type;					
 | 
				
			||||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
@@ -707,7 +711,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  int offset, ptype;
 | 
					  int offset, ptype;
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  int nmu=0;
 | 
					  int nmu=0;
 | 
				
			||||||
  ZERO_RESULT;
 | 
					  ZERO_RESULT;
 | 
				
			||||||
@@ -726,8 +730,8 @@ template<class Impl>  accelerator_inline
 | 
				
			|||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
					void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 | 
				
			||||||
						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
											  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  //  auto st_p = st._entries_p;						
 | 
					  auto st_p = st._entries_p;						
 | 
				
			||||||
  //  auto st_perm = st._permute_type;					
 | 
					  auto st_perm = st._permute_type;					
 | 
				
			||||||
  typedef typename Simd::scalar_type S;
 | 
					  typedef typename Simd::scalar_type S;
 | 
				
			||||||
  typedef typename Simd::vector_type V;
 | 
					  typedef typename Simd::vector_type V;
 | 
				
			||||||
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
 | 
					  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
 | 
				
			||||||
@@ -738,7 +742,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
 | 
				
			|||||||
  HAND_DECLARATIONS(Simt);
 | 
					  HAND_DECLARATIONS(Simt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  StencilEntry *SE;
 | 
					  StencilEntry *SE;
 | 
				
			||||||
  //  int offset, ptype;
 | 
					  int offset, ptype;
 | 
				
			||||||
  int nmu=0;
 | 
					  int nmu=0;
 | 
				
			||||||
  ZERO_RESULT;
 | 
					  ZERO_RESULT;
 | 
				
			||||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
					  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonCloverFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonKernelsInstantiationGparity.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonTMFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION GparityWilsonImplDF
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonCloverFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonKernelsInstantiationGparity.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonTMFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION GparityWilsonImplFH
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonCloverFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermionInstantiation.cc.master
 | 
				
			||||||
@@ -2,11 +2,14 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Grid physics library, www.github.com/paboyle/Grid
 | 
					Grid physics library, www.github.com/paboyle/Grid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Source file: ./lib/serialisation/BaseIO.h
 | 
					Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Copyright (C) 2015
 | 
					Copyright (C) 2015, 2020
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Author: Michael Marshall <michael.marshall@ed.ac.uk>
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This program is free software; you can redistribute it and/or modify
 | 
					This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
it under the terms of the GNU General Public License as published by
 | 
					it under the terms of the GNU General Public License as published by
 | 
				
			||||||
@@ -22,14 +25,27 @@ You should have received a copy of the GNU General Public License along
 | 
				
			|||||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
					with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
					51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
					See the full license in the file "LICENSE" in the top level distribution
 | 
				
			||||||
 | 
					directory
 | 
				
			||||||
*************************************************************************************/
 | 
					*************************************************************************************/
 | 
				
			||||||
/*  END LEGAL */
 | 
					/*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <Grid/GridCore.h>
 | 
					#ifndef AVX512
 | 
				
			||||||
 | 
					#ifndef QPX
 | 
				
			||||||
 | 
					#ifndef A64FX
 | 
				
			||||||
 | 
					#ifndef A64FXFIXEDSIZE
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
NAMESPACE_BEGIN(Grid)
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
std::uint64_t EigenIO::EigenResizeCounter(0);
 | 
					#include "impl.h"
 | 
				
			||||||
 | 
					template class WilsonKernels<IMPLEMENTATION>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
NAMESPACE_END(Grid)
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonTMFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION WilsonImplDF
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonCloverFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1,51 @@
 | 
				
			|||||||
 | 
					/*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Grid physics library, www.github.com/paboyle/Grid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Copyright (C) 2015, 2020
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					(at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					See the full license in the file "LICENSE" in the top level distribution
 | 
				
			||||||
 | 
					directory
 | 
				
			||||||
 | 
					*************************************************************************************/
 | 
				
			||||||
 | 
					/*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef AVX512
 | 
				
			||||||
 | 
					#ifndef QPX
 | 
				
			||||||
 | 
					#ifndef A64FX
 | 
				
			||||||
 | 
					#ifndef A64FXFIXEDSIZE
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "impl.h"
 | 
				
			||||||
 | 
					template class WilsonKernels<IMPLEMENTATION>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonTMFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION WilsonImplFH
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1,51 @@
 | 
				
			|||||||
 | 
					/*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Grid physics library, www.github.com/paboyle/Grid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Copyright (C) 2015, 2020
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					(at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					See the full license in the file "LICENSE" in the top level distribution
 | 
				
			||||||
 | 
					directory
 | 
				
			||||||
 | 
					*************************************************************************************/
 | 
				
			||||||
 | 
					/*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef AVX512
 | 
				
			||||||
 | 
					#ifndef QPX
 | 
				
			||||||
 | 
					#ifndef A64FX
 | 
				
			||||||
 | 
					#ifndef A64FXFIXEDSIZE
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "impl.h"
 | 
				
			||||||
 | 
					template class WilsonKernels<IMPLEMENTATION>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION ZWilsonImplDF
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../CayleyFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../ContinuedFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../DomainWallEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../MobiusEOFAFermionInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../PartialFractionFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../WilsonFermion5DInstantiation.cc.master
 | 
				
			||||||
@@ -0,0 +1,51 @@
 | 
				
			|||||||
 | 
					/*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Grid physics library, www.github.com/paboyle/Grid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Copyright (C) 2015, 2020
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					(at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					See the full license in the file "LICENSE" in the top level distribution
 | 
				
			||||||
 | 
					directory
 | 
				
			||||||
 | 
					*************************************************************************************/
 | 
				
			||||||
 | 
					/*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef AVX512
 | 
				
			||||||
 | 
					#ifndef QPX
 | 
				
			||||||
 | 
					#ifndef A64FX
 | 
				
			||||||
 | 
					#ifndef A64FXFIXEDSIZE
 | 
				
			||||||
 | 
					#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "impl.h"
 | 
				
			||||||
 | 
					template class WilsonKernels<IMPLEMENTATION>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					#define IMPLEMENTATION ZWilsonImplFH
 | 
				
			||||||
@@ -9,6 +9,8 @@ STAG5_IMPL_LIST=""
 | 
				
			|||||||
WILSON_IMPL_LIST=" \
 | 
					WILSON_IMPL_LIST=" \
 | 
				
			||||||
	   WilsonImplF \
 | 
						   WilsonImplF \
 | 
				
			||||||
	   WilsonImplD \
 | 
						   WilsonImplD \
 | 
				
			||||||
 | 
						   WilsonImplFH \
 | 
				
			||||||
 | 
						   WilsonImplDF \
 | 
				
			||||||
	   WilsonAdjImplF \
 | 
						   WilsonAdjImplF \
 | 
				
			||||||
	   WilsonAdjImplD \
 | 
						   WilsonAdjImplD \
 | 
				
			||||||
	   WilsonTwoIndexSymmetricImplF \
 | 
						   WilsonTwoIndexSymmetricImplF \
 | 
				
			||||||
@@ -16,17 +18,26 @@ WILSON_IMPL_LIST=" \
 | 
				
			|||||||
	   WilsonTwoIndexAntiSymmetricImplF \
 | 
						   WilsonTwoIndexAntiSymmetricImplF \
 | 
				
			||||||
	   WilsonTwoIndexAntiSymmetricImplD \
 | 
						   WilsonTwoIndexAntiSymmetricImplD \
 | 
				
			||||||
	   GparityWilsonImplF \
 | 
						   GparityWilsonImplF \
 | 
				
			||||||
	   GparityWilsonImplD "
 | 
						   GparityWilsonImplD \
 | 
				
			||||||
 | 
						   GparityWilsonImplFH \
 | 
				
			||||||
 | 
						   GparityWilsonImplDF"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DWF_IMPL_LIST=" \
 | 
					DWF_IMPL_LIST=" \
 | 
				
			||||||
	   WilsonImplF \
 | 
						   WilsonImplF \
 | 
				
			||||||
	   WilsonImplD \
 | 
						   WilsonImplD \
 | 
				
			||||||
 | 
						   WilsonImplFH \
 | 
				
			||||||
 | 
						   WilsonImplDF \
 | 
				
			||||||
	   ZWilsonImplF \
 | 
						   ZWilsonImplF \
 | 
				
			||||||
	   ZWilsonImplD "
 | 
						   ZWilsonImplD \
 | 
				
			||||||
 | 
						   ZWilsonImplFH \
 | 
				
			||||||
 | 
						   ZWilsonImplDF "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
GDWF_IMPL_LIST=" \
 | 
					GDWF_IMPL_LIST=" \
 | 
				
			||||||
	   GparityWilsonImplF \
 | 
						   GparityWilsonImplF \
 | 
				
			||||||
	   GparityWilsonImplD "
 | 
						   GparityWilsonImplD \
 | 
				
			||||||
 | 
						   GparityWilsonImplFH \
 | 
				
			||||||
 | 
						   GparityWilsonImplDF"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
IMPL_LIST="$STAG_IMPL_LIST  $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
 | 
					IMPL_LIST="$STAG_IMPL_LIST  $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user