mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Compare commits
	
		
			114 Commits
		
	
	
		
			feature/ei
			...
			feature/a2
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					d8c0c0ba0a | ||
| 
						 | 
					c6cf918d4c | ||
| 
						 | 
					6d0a907c5c | ||
| 
						 | 
					3276aa67dc | ||
| 
						 | 
					7cf7f11e1a | ||
| f1f655d92b | |||
| 43334e88c3 | |||
| 4f1e66b044 | |||
| 64fe5b21b4 | |||
| 
						 | 
					ee9889821d | ||
| eb470aa6dc | |||
| 77af9a3ddc | |||
| 102089798c | |||
| 39cea8b5a7 | |||
| a65f66d2db | |||
| 
						 | 
					936c5ecf69 | ||
| 
						 | 
					22cfbdbbb3 | ||
| 
						 | 
					093d1ee21b | ||
| 
						 | 
					d6ba2581ce | ||
| 
						 | 
					577c064184 | ||
| 
						 | 
					2ff1fa6fad | ||
| 
						 | 
					70be1bd8be | ||
| 4ef50ba31f | |||
| 3e97a26f90 | |||
| 599f28f6ef | |||
| 
						 | 
					c48da35921 | ||
| 
						 | 
					6c5fa8dcd8 | ||
| 
						 | 
					0d2f913a1a | ||
| 
						 | 
					1a74816c25 | ||
| 
						 | 
					73de335256 | ||
| 
						 | 
					228fd450ce | ||
| 
						 | 
					b949cf6b12 | ||
| 
						 | 
					11bc1aeadc | ||
| 
						 | 
					66005929af | ||
| 
						 | 
					ff7c847735 | ||
| 
						 | 
					1aa988b2af | ||
| 
						 | 
					edf17708a8 | ||
| 
						 | 
					f46f029dbb | ||
| 
						 | 
					3dccd7aa2c | ||
| 
						 | 
					65e6e7da6f | ||
| 
						 | 
					b5e87e8d97 | ||
| 
						 | 
					5f5807d60a | ||
| 
						 | 
					7974acff54 | ||
| f0d17d2b49 | |||
| 244c003a1b | |||
| 0174f5f742 | |||
| 
						 | 
					32b2b59be4 | ||
| 
						 | 
					86bb0cc24b | ||
| 
						 | 
					84c19587e7 | ||
| 
						 | 
					237ce92540 | ||
| 
						 | 
					a7ffc61e82 | ||
| 
						 | 
					fd97f64612 | ||
| 
						 | 
					8720aecb80 | ||
| 
						 | 
					cdf0a04fc5 | ||
| 
						 | 
					616d3dd737 | ||
| 
						 | 
					8b066baca8 | ||
| 
						 | 
					e97f3688db | ||
| 
						 | 
					89a1e78390 | ||
| 
						 | 
					ffbb3fc02c | ||
| 
						 | 
					5a73ef3647 | ||
| 
						 | 
					87e5d2f4b7 | ||
| 
						 | 
					d720f10758 | ||
| 
						 | 
					14fcd0912a | ||
| 
						 | 
					3111c0bd4f | ||
| 
						 | 
					e03064490e | ||
| 
						 | 
					1a4c8c3387 | ||
| 
						 | 
					2b1e259441 | ||
| 
						 | 
					f39c2a240b | ||
| 
						 | 
					0d95805cde | ||
| 
						 | 
					f67830587f | ||
| 
						 | 
					6bf7f839ff | ||
| 
						 | 
					e3147881a9 | ||
| 
						 | 
					fb559614ad | ||
| 
						 | 
					e93e12b6a4 | ||
| 
						 | 
					0c3112cd94 | ||
| 
						 | 
					8cfd5d2639 | ||
| 
						 | 
					1c9f20b15e | ||
| 
						 | 
					32237895bd | ||
| 
						 | 
					9fcb47ee63 | ||
| 
						 | 
					1d252d0922 | ||
| 
						 | 
					006cc8a8f1 | ||
| 
						 | 
					cf2938688a | ||
| 
						 | 
					ee63721bad | ||
| 
						 | 
					22c5168d70 | ||
| 
						 | 
					949ac3cd24 | ||
| 
						 | 
					7bc0166c1c | ||
| 
						 | 
					cb0d1b3399 | ||
| 
						 | 
					d1f1ccc705 | ||
| 
						 | 
					c7519a237a | ||
| 
						 | 
					32be2b13d3 | ||
| 
						 | 
					92b342a477 | ||
| 
						 | 
					556da86ac3 | ||
| 
						 | 
					8285e41574 | ||
| 
						 | 
					f999408e92 | ||
| 
						 | 
					a7abda89e2 | ||
| 
						 | 
					7860a50f70 | ||
| 
						 | 
					6c6812a5ca | ||
| 
						 | 
					8358ee38c4 | ||
| 
						 | 
					1f154fe652 | ||
| 
						 | 
					d708c0258d | ||
| 
						 | 
					a7635fd5ba | ||
| 
						 | 
					ebb60330c9 | ||
| 
						 | 
					32fbdf4fb1 | ||
| 
						 | 
					a9847aa866 | ||
| 
						 | 
					d24d8e8398 | ||
| 
						 | 
					162e4bb567 | ||
| 
						 | 
					07c0c02f8c | ||
| 
						 | 
					8c31c065b5 | ||
| 
						 | 
					b1c86900b2 | ||
| 
						 | 
					bbbee5660d | ||
| 
						 | 
					52081acfa5 | ||
| 
						 | 
					f8b8e00090 | ||
| 
						 | 
					28a1fcaaff | ||
| 2c22db841a | 
@@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/perfmon/PerfCount.h>
 | 
			
		||||
#include <Grid/util/Util.h>
 | 
			
		||||
#include <Grid/log/Log.h>
 | 
			
		||||
#include <Grid/allocator/AlignedAllocator.h>
 | 
			
		||||
#include <Grid/allocator/Allocator.h>
 | 
			
		||||
#include <Grid/simd/Simd.h>
 | 
			
		||||
#include <Grid/threads/Threads.h>
 | 
			
		||||
#include <Grid/threads/ThreadReduction.h>
 | 
			
		||||
#include <Grid/serialisation/Serialisation.h>
 | 
			
		||||
#include <Grid/util/Sha.h>
 | 
			
		||||
#include <Grid/communicator/Communicator.h> 
 | 
			
		||||
 
 | 
			
		||||
@@ -6,6 +6,7 @@
 | 
			
		||||
///////////////////
 | 
			
		||||
#include <cassert>
 | 
			
		||||
#include <complex>
 | 
			
		||||
#include <memory>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <array>
 | 
			
		||||
#include <string>
 | 
			
		||||
 
 | 
			
		||||
@@ -18,19 +18,20 @@
 | 
			
		||||
#pragma push_macro("__CUDA_ARCH__")
 | 
			
		||||
#pragma push_macro("__NVCC__")
 | 
			
		||||
#pragma push_macro("__CUDACC__")
 | 
			
		||||
#undef __CUDA_ARCH__
 | 
			
		||||
#undef __NVCC__
 | 
			
		||||
#undef __CUDACC__
 | 
			
		||||
#undef __CUDA_ARCH__
 | 
			
		||||
#define __NVCC__REDEFINE__
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
/* SYCL save and restore compile environment*/
 | 
			
		||||
#ifdef __SYCL_DEVICE_ONLY__  
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
#pragma push
 | 
			
		||||
#pragma push_macro("__SYCL_DEVICE_ONLY__")
 | 
			
		||||
#undef __SYCL_DEVICE_ONLY__
 | 
			
		||||
#undef EIGEN_USE_SYCL
 | 
			
		||||
#define EIGEN_DONT_VECTORIZE
 | 
			
		||||
//#undef EIGEN_USE_SYCL
 | 
			
		||||
#define __SYCL__REDEFINE__
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -41,7 +42,7 @@
 | 
			
		||||
#ifdef __NVCC__REDEFINE__
 | 
			
		||||
#pragma pop_macro("__CUDACC__")
 | 
			
		||||
#pragma pop_macro("__NVCC__")
 | 
			
		||||
#pragma pop_macro("__CUDA_ARCH__")
 | 
			
		||||
#pragma pop_macro("GRID_SIMT")
 | 
			
		||||
#pragma pop
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -21,7 +21,7 @@ if BUILD_HDF5
 | 
			
		||||
  extra_headers+=serialisation/Hdf5Type.h
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
all: version-cache
 | 
			
		||||
all: version-cache Version.h
 | 
			
		||||
 | 
			
		||||
version-cache:
 | 
			
		||||
	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
 | 
			
		||||
@@ -42,7 +42,7 @@ version-cache:
 | 
			
		||||
	fi;\
 | 
			
		||||
	rm -f vertmp
 | 
			
		||||
 | 
			
		||||
Version.h:
 | 
			
		||||
Version.h: version-cache
 | 
			
		||||
	cp version-cache Version.h
 | 
			
		||||
 | 
			
		||||
.PHONY: version-cache
 | 
			
		||||
 
 | 
			
		||||
@@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifndef GRID_ALGORITHMS_H
 | 
			
		||||
#define GRID_ALGORITHMS_H
 | 
			
		||||
 | 
			
		||||
NAMESPACE_CHECK(algorithms);
 | 
			
		||||
#include <Grid/algorithms/SparseMatrix.h>
 | 
			
		||||
#include <Grid/algorithms/LinearOperator.h>
 | 
			
		||||
#include <Grid/algorithms/Preconditioner.h>
 | 
			
		||||
NAMESPACE_CHECK(SparseMatrix);
 | 
			
		||||
 | 
			
		||||
#include <Grid/algorithms/approx/Zolotarev.h>
 | 
			
		||||
#include <Grid/algorithms/approx/Chebyshev.h>
 | 
			
		||||
@@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/algorithms/approx/Forecast.h>
 | 
			
		||||
#include <Grid/algorithms/approx/RemezGeneral.h>
 | 
			
		||||
#include <Grid/algorithms/approx/ZMobius.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_CHECK(approx);
 | 
			
		||||
#include <Grid/algorithms/iterative/Deflation.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
 | 
			
		||||
NAMESPACE_CHECK(ConjGrad);
 | 
			
		||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
 | 
			
		||||
NAMESPACE_CHECK(BiCGSTAB);
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/NormalEquations.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
 | 
			
		||||
@@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/PowerMethod.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_CHECK(PowerMethod);
 | 
			
		||||
#include <Grid/algorithms/CoarsenedMatrix.h>
 | 
			
		||||
NAMESPACE_CHECK(CoarsendMatrix);
 | 
			
		||||
#include <Grid/algorithms/FFT.h>
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -1,14 +1,3 @@
 | 
			
		||||
    // blockZaxpy in bockPromote - 3s, 5%
 | 
			
		||||
    // noncoalesced linalg in Preconditionoer ~ 3s 5%
 | 
			
		||||
    // Lancos tuning or replace 10-20s ~ 25%, open ended
 | 
			
		||||
    // setup tuning   5s  ~  8%
 | 
			
		||||
    //    -- e.g. ordermin, orderstep tunables.
 | 
			
		||||
    // MdagM path without norm in LinOp code.     few seconds
 | 
			
		||||
 | 
			
		||||
    // Mdir calc blocking kernels
 | 
			
		||||
    // Fuse kernels in blockMaskedInnerProduct
 | 
			
		||||
    // preallocate Vectors in Cayley 5D ~ few percent few seconds
 | 
			
		||||
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
@@ -91,34 +80,7 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
    directions   [2*_d]=0;
 | 
			
		||||
    displacements[2*_d]=0;
 | 
			
		||||
      
 | 
			
		||||
    //// report back
 | 
			
		||||
    std::cout<<GridLogMessage<<"directions    :";
 | 
			
		||||
    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
 | 
			
		||||
    std::cout<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"displacements :";
 | 
			
		||||
    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
 | 
			
		||||
    std::cout<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  /*
 | 
			
		||||
  // Original cleaner code
 | 
			
		||||
  Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
 | 
			
		||||
  for(int d=0;d<dimension;d++){
 | 
			
		||||
  directions[2*d  ] = d;
 | 
			
		||||
  directions[2*d+1] = d;
 | 
			
		||||
  displacements[2*d  ] = +1;
 | 
			
		||||
  displacements[2*d+1] = -1;
 | 
			
		||||
  }
 | 
			
		||||
  directions   [2*dimension]=0;
 | 
			
		||||
  displacements[2*dimension]=0;
 | 
			
		||||
  }
 | 
			
		||||
  std::vector<int> GetDelta(int point) {
 | 
			
		||||
  std::vector<int> delta(dimension,0);
 | 
			
		||||
  delta[directions[point]] = displacements[point];
 | 
			
		||||
  return delta;
 | 
			
		||||
  };
 | 
			
		||||
  */    
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
  
 | 
			
		||||
@@ -149,25 +111,7 @@ public:
 | 
			
		||||
    CoarseScalar InnerProd(CoarseGrid); 
 | 
			
		||||
    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
 | 
			
		||||
    blockOrthogonalise(InnerProd,subspace);
 | 
			
		||||
    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
 | 
			
		||||
    //    blockOrthogonalise(InnerProd,subspace);
 | 
			
		||||
    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
 | 
			
		||||
    //      CheckOrthogonal();
 | 
			
		||||
  } 
 | 
			
		||||
  void CheckOrthogonal(void){
 | 
			
		||||
    CoarseVector iProj(CoarseGrid); 
 | 
			
		||||
    CoarseVector eProj(CoarseGrid); 
 | 
			
		||||
    for(int i=0;i<nbasis;i++){
 | 
			
		||||
      blockProject(iProj,subspace[i],subspace);
 | 
			
		||||
      eProj=Zero(); 
 | 
			
		||||
      accelerator_for(ss, CoarseGrid->oSites(),1,{
 | 
			
		||||
	eProj[ss](i)=CComplex(1.0);
 | 
			
		||||
      });
 | 
			
		||||
      eProj=eProj - iProj;
 | 
			
		||||
      std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
 | 
			
		||||
    blockProject(CoarseVec,FineVec,subspace);
 | 
			
		||||
  }
 | 
			
		||||
@@ -175,11 +119,6 @@ public:
 | 
			
		||||
    FineVec.Checkerboard() = subspace[0].Checkerboard();
 | 
			
		||||
    blockPromote(CoarseVec,FineVec,subspace);
 | 
			
		||||
  }
 | 
			
		||||
  void CreateSubspaceRandom(GridParallelRNG &RNG){
 | 
			
		||||
    for(int i=0;i<nbasis;i++){
 | 
			
		||||
      random(RNG,subspace[i]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
 | 
			
		||||
 | 
			
		||||
@@ -190,12 +129,12 @@ public:
 | 
			
		||||
    FineField Mn(FineGrid);
 | 
			
		||||
 | 
			
		||||
    for(int b=0;b<nn;b++){
 | 
			
		||||
	
 | 
			
		||||
      
 | 
			
		||||
      subspace[b] = Zero();
 | 
			
		||||
      gaussian(RNG,noise);
 | 
			
		||||
      scale = std::pow(norm2(noise),-0.5); 
 | 
			
		||||
      noise=noise*scale;
 | 
			
		||||
	
 | 
			
		||||
      
 | 
			
		||||
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<1;i++){
 | 
			
		||||
@@ -218,7 +157,7 @@ public:
 | 
			
		||||
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
 | 
			
		||||
  // and this is the best I found
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#if 1
 | 
			
		||||
 | 
			
		||||
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 | 
			
		||||
				       int nn,
 | 
			
		||||
				       double hi,
 | 
			
		||||
@@ -280,10 +219,10 @@ public:
 | 
			
		||||
	
 | 
			
		||||
	hermop.HermOp(*Tn,y);
 | 
			
		||||
 | 
			
		||||
	auto y_v = y.View();
 | 
			
		||||
	auto Tn_v = Tn->View();
 | 
			
		||||
	auto Tnp_v = Tnp->View();
 | 
			
		||||
	auto Tnm_v = Tnm->View();
 | 
			
		||||
	autoView( y_v , y, AcceleratorWrite);
 | 
			
		||||
	autoView( Tn_v , (*Tn), AcceleratorWrite);
 | 
			
		||||
	autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 | 
			
		||||
	autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 | 
			
		||||
	const int Nsimd = CComplex::Nsimd();
 | 
			
		||||
	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
 | 
			
		||||
	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 | 
			
		||||
@@ -313,201 +252,6 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
    assert(b==nn);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
#if 0
 | 
			
		||||
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 | 
			
		||||
				       int nn,
 | 
			
		||||
				       double hi,
 | 
			
		||||
				       double lo,
 | 
			
		||||
				       int orderfilter,
 | 
			
		||||
				       int ordermin,
 | 
			
		||||
				       int orderstep,
 | 
			
		||||
				       double filterlo
 | 
			
		||||
				       ) {
 | 
			
		||||
 | 
			
		||||
    RealD scale;
 | 
			
		||||
 | 
			
		||||
    FineField noise(FineGrid);
 | 
			
		||||
    FineField Mn(FineGrid);
 | 
			
		||||
    FineField tmp(FineGrid);
 | 
			
		||||
    FineField combined(FineGrid);
 | 
			
		||||
 | 
			
		||||
    // New normalised noise
 | 
			
		||||
    gaussian(RNG,noise);
 | 
			
		||||
    scale = std::pow(norm2(noise),-0.5); 
 | 
			
		||||
    noise=noise*scale;
 | 
			
		||||
 | 
			
		||||
    // Initial matrix element
 | 
			
		||||
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
 | 
			
		||||
 | 
			
		||||
    int b =0;
 | 
			
		||||
#define FILTERb(llo,hhi,oorder)						\
 | 
			
		||||
    {									\
 | 
			
		||||
      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
 | 
			
		||||
      Cheb(hermop,noise,Mn);						\
 | 
			
		||||
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
 | 
			
		||||
      subspace[b]   = Mn;						\
 | 
			
		||||
      hermop.Op(Mn,tmp);						\
 | 
			
		||||
      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
 | 
			
		||||
      b++;								\
 | 
			
		||||
    }									
 | 
			
		||||
 | 
			
		||||
    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
 | 
			
		||||
 | 
			
		||||
    RealD alpha=-0.8;
 | 
			
		||||
    RealD beta =-0.8;
 | 
			
		||||
#define FILTER(llo,hhi,oorder)						\
 | 
			
		||||
    {									\
 | 
			
		||||
      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
 | 
			
		||||
      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
 | 
			
		||||
      Cheb(hermop,noise,Mn);						\
 | 
			
		||||
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
 | 
			
		||||
      subspace[b]   = Mn;						\
 | 
			
		||||
      hermop.Op(Mn,tmp);						\
 | 
			
		||||
      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
 | 
			
		||||
      b++;								\
 | 
			
		||||
    }									
 | 
			
		||||
    
 | 
			
		||||
#define FILTERc(llo,hhi,oorder)				\
 | 
			
		||||
    {							\
 | 
			
		||||
      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
 | 
			
		||||
      Cheb(hermop,noise,combined);			\
 | 
			
		||||
    }									
 | 
			
		||||
 | 
			
		||||
    double node = 0.000;
 | 
			
		||||
    FILTERb(lo,hi,orderfilter);// 0
 | 
			
		||||
    //    FILTERc(node,hi,51);// 0
 | 
			
		||||
    noise = Mn;
 | 
			
		||||
    int base = 0;
 | 
			
		||||
    int mult = 100;
 | 
			
		||||
    FILTER(node,hi,base+1*mult);
 | 
			
		||||
    FILTER(node,hi,base+2*mult);
 | 
			
		||||
    FILTER(node,hi,base+3*mult);
 | 
			
		||||
    FILTER(node,hi,base+4*mult);
 | 
			
		||||
    FILTER(node,hi,base+5*mult);
 | 
			
		||||
    FILTER(node,hi,base+6*mult);
 | 
			
		||||
    FILTER(node,hi,base+7*mult);
 | 
			
		||||
    FILTER(node,hi,base+8*mult);
 | 
			
		||||
    FILTER(node,hi,base+9*mult);
 | 
			
		||||
    FILTER(node,hi,base+10*mult);
 | 
			
		||||
    FILTER(node,hi,base+11*mult);
 | 
			
		||||
    FILTER(node,hi,base+12*mult);
 | 
			
		||||
    FILTER(node,hi,base+13*mult);
 | 
			
		||||
    FILTER(node,hi,base+14*mult);
 | 
			
		||||
    FILTER(node,hi,base+15*mult);
 | 
			
		||||
    assert(b==nn);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 | 
			
		||||
				       int nn,
 | 
			
		||||
				       double hi,
 | 
			
		||||
				       double lo,
 | 
			
		||||
				       int orderfilter,
 | 
			
		||||
				       int ordermin,
 | 
			
		||||
				       int orderstep,
 | 
			
		||||
				       double filterlo
 | 
			
		||||
				       ) {
 | 
			
		||||
 | 
			
		||||
    RealD scale;
 | 
			
		||||
 | 
			
		||||
    FineField noise(FineGrid);
 | 
			
		||||
    FineField Mn(FineGrid);
 | 
			
		||||
    FineField tmp(FineGrid);
 | 
			
		||||
    FineField combined(FineGrid);
 | 
			
		||||
 | 
			
		||||
    // New normalised noise
 | 
			
		||||
    gaussian(RNG,noise);
 | 
			
		||||
    scale = std::pow(norm2(noise),-0.5); 
 | 
			
		||||
    noise=noise*scale;
 | 
			
		||||
 | 
			
		||||
    // Initial matrix element
 | 
			
		||||
    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
 | 
			
		||||
 | 
			
		||||
    int b =0;
 | 
			
		||||
    {						
 | 
			
		||||
      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
 | 
			
		||||
      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
 | 
			
		||||
      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
 | 
			
		||||
      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
 | 
			
		||||
      JacobiPoly(hermop,noise,Mn);
 | 
			
		||||
      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
 | 
			
		||||
      subspace[b]   = Mn;
 | 
			
		||||
      hermop.Op(Mn,tmp);
 | 
			
		||||
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
 | 
			
		||||
      b++;
 | 
			
		||||
      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
 | 
			
		||||
      //      subspace[b]   = tmp;      b++;
 | 
			
		||||
      //    }									
 | 
			
		||||
    }									
 | 
			
		||||
 | 
			
		||||
#define FILTER(lambda)						\
 | 
			
		||||
    {								\
 | 
			
		||||
      hermop.HermOp(subspace[0],tmp);				\
 | 
			
		||||
      tmp = tmp - lambda *subspace[0];				\
 | 
			
		||||
      scale = std::pow(norm2(tmp),-0.5);			\
 | 
			
		||||
      tmp=tmp*scale;							\
 | 
			
		||||
      subspace[b]   = tmp;						\
 | 
			
		||||
      hermop.Op(subspace[b],tmp);					\
 | 
			
		||||
      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
 | 
			
		||||
      b++;								\
 | 
			
		||||
    }									
 | 
			
		||||
    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
 | 
			
		||||
    //      subspace[b]   = tmp;      b++;
 | 
			
		||||
    //    }									
 | 
			
		||||
 | 
			
		||||
    FILTER(2.0e-5);
 | 
			
		||||
    FILTER(2.0e-4);
 | 
			
		||||
    FILTER(4.0e-4);
 | 
			
		||||
    FILTER(8.0e-4);
 | 
			
		||||
    FILTER(8.0e-4);
 | 
			
		||||
 | 
			
		||||
    FILTER(2.0e-3);
 | 
			
		||||
    FILTER(3.0e-3);
 | 
			
		||||
    FILTER(4.0e-3);
 | 
			
		||||
    FILTER(5.0e-3);
 | 
			
		||||
    FILTER(6.0e-3);
 | 
			
		||||
 | 
			
		||||
    FILTER(2.5e-3);
 | 
			
		||||
    FILTER(3.5e-3);
 | 
			
		||||
    FILTER(4.5e-3);
 | 
			
		||||
    FILTER(5.5e-3);
 | 
			
		||||
    FILTER(6.5e-3);
 | 
			
		||||
 | 
			
		||||
    //    FILTER(6.0e-5);//6
 | 
			
		||||
    //    FILTER(7.0e-5);//8
 | 
			
		||||
    //    FILTER(8.0e-5);//9
 | 
			
		||||
    //    FILTER(9.0e-5);//3
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
    //    FILTER(1.0e-4);//10
 | 
			
		||||
    FILTER(2.0e-4);//11
 | 
			
		||||
    //   FILTER(3.0e-4);//12
 | 
			
		||||
    //    FILTER(4.0e-4);//13
 | 
			
		||||
    FILTER(5.0e-4);//14
 | 
			
		||||
 | 
			
		||||
    FILTER(6.0e-3);//4
 | 
			
		||||
    FILTER(7.0e-4);//1
 | 
			
		||||
    FILTER(8.0e-4);//7
 | 
			
		||||
    FILTER(9.0e-4);//15
 | 
			
		||||
    FILTER(1.0e-3);//2
 | 
			
		||||
 | 
			
		||||
    FILTER(2.0e-3);//2
 | 
			
		||||
    FILTER(3.0e-3);//2
 | 
			
		||||
    FILTER(4.0e-3);//2
 | 
			
		||||
    FILTER(5.0e-3);//2
 | 
			
		||||
    FILTER(6.0e-3);//2
 | 
			
		||||
 | 
			
		||||
    FILTER(7.0e-3);//2
 | 
			
		||||
    FILTER(8.0e-3);//2
 | 
			
		||||
    FILTER(1.0e-2);//2
 | 
			
		||||
    */
 | 
			
		||||
    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
 | 
			
		||||
    assert(b==nn);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@@ -535,7 +279,7 @@ public:
 | 
			
		||||
  CartesianStencil<siteVector,siteVector,int> Stencil; 
 | 
			
		||||
 | 
			
		||||
  std::vector<CoarseMatrix> A;
 | 
			
		||||
      
 | 
			
		||||
    
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
  // Interface
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
@@ -549,13 +293,13 @@ public:
 | 
			
		||||
    SimpleCompressor<siteVector> compressor;
 | 
			
		||||
 | 
			
		||||
    Stencil.HaloExchange(in,compressor);
 | 
			
		||||
 | 
			
		||||
    auto in_v = in.View();
 | 
			
		||||
    auto out_v = out.View();
 | 
			
		||||
    autoView( in_v , in, AcceleratorRead);
 | 
			
		||||
    autoView( out_v , out, AcceleratorWrite);
 | 
			
		||||
    typedef LatticeView<Cobj> Aview;
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
    Vector<Aview> AcceleratorViewContainer;
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
 | 
			
		||||
  
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
 | 
			
		||||
    Aview *Aview_p = & AcceleratorViewContainer[0];
 | 
			
		||||
 | 
			
		||||
    const int Nsimd = CComplex::Nsimd();
 | 
			
		||||
@@ -572,24 +316,25 @@ public:
 | 
			
		||||
      int ptype;
 | 
			
		||||
      StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
      int lane=SIMTlane(Nsimd);
 | 
			
		||||
      for(int point=0;point<geom.npoint;point++){
 | 
			
		||||
 | 
			
		||||
	SE=Stencil.GetEntry(ptype,point,ss);
 | 
			
		||||
	  
 | 
			
		||||
	if(SE->_is_local) { 
 | 
			
		||||
	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
 | 
			
		||||
	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 | 
			
		||||
	} else {
 | 
			
		||||
	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
 | 
			
		||||
	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 | 
			
		||||
	}
 | 
			
		||||
	synchronise();
 | 
			
		||||
	acceleratorSynchronise();
 | 
			
		||||
 | 
			
		||||
	for(int bb=0;bb<nbasis;bb++) {
 | 
			
		||||
	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      coalescedWrite(out_v[ss](b),res,lane);
 | 
			
		||||
    });
 | 
			
		||||
      coalescedWrite(out_v[ss](b),res);
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void Mdag (const CoarseVector &in, CoarseVector &out)
 | 
			
		||||
@@ -617,11 +362,11 @@ public:
 | 
			
		||||
 | 
			
		||||
    typedef LatticeView<Cobj> Aview;
 | 
			
		||||
    Vector<Aview> AcceleratorViewContainer;
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
 | 
			
		||||
    Aview *Aview_p = & AcceleratorViewContainer[0];
 | 
			
		||||
 | 
			
		||||
    auto out_v = out.View();
 | 
			
		||||
    auto in_v  = in.View();
 | 
			
		||||
    autoView( out_v , out, AcceleratorWrite);
 | 
			
		||||
    autoView( in_v  , in, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
    const int Nsimd = CComplex::Nsimd();
 | 
			
		||||
    typedef decltype(coalescedRead(in_v[0])) calcVector;
 | 
			
		||||
@@ -635,45 +380,21 @@ public:
 | 
			
		||||
      int ptype;
 | 
			
		||||
      StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
      int lane=SIMTlane(Nsimd);
 | 
			
		||||
      SE=Stencil.GetEntry(ptype,point,ss);
 | 
			
		||||
	  
 | 
			
		||||
      if(SE->_is_local) { 
 | 
			
		||||
	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
 | 
			
		||||
	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 | 
			
		||||
      } else {
 | 
			
		||||
	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
 | 
			
		||||
	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 | 
			
		||||
      }
 | 
			
		||||
      synchronise();
 | 
			
		||||
      acceleratorSynchronise();
 | 
			
		||||
 | 
			
		||||
      for(int bb=0;bb<nbasis;bb++) {
 | 
			
		||||
	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 | 
			
		||||
      }
 | 
			
		||||
      coalescedWrite(out_v[ss](b),res,lane);
 | 
			
		||||
      coalescedWrite(out_v[ss](b),res);
 | 
			
		||||
    });
 | 
			
		||||
#if 0
 | 
			
		||||
    accelerator_for(ss,Grid()->oSites(),1,{
 | 
			
		||||
 | 
			
		||||
      siteVector res = Zero();
 | 
			
		||||
      siteVector nbr;
 | 
			
		||||
      int ptype;
 | 
			
		||||
      StencilEntry *SE;
 | 
			
		||||
      
 | 
			
		||||
      SE=Stencil.GetEntry(ptype,point,ss);
 | 
			
		||||
      
 | 
			
		||||
      if(SE->_is_local&&SE->_permute) {
 | 
			
		||||
	permute(nbr,in_v[SE->_offset],ptype);
 | 
			
		||||
      } else if(SE->_is_local) {
 | 
			
		||||
	nbr = in_v[SE->_offset];
 | 
			
		||||
      } else {
 | 
			
		||||
	nbr = Stencil.CommBuf()[SE->_offset];
 | 
			
		||||
      }
 | 
			
		||||
      synchronise();
 | 
			
		||||
 | 
			
		||||
      res = res + Aview_p[point][ss]*nbr;
 | 
			
		||||
      
 | 
			
		||||
      out_v[ss]=res;
 | 
			
		||||
    });
 | 
			
		||||
#endif
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
 | 
			
		||||
  }
 | 
			
		||||
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
 | 
			
		||||
  {
 | 
			
		||||
@@ -841,10 +562,10 @@ public:
 | 
			
		||||
	    
 | 
			
		||||
	    blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
 | 
			
		||||
	    
 | 
			
		||||
	    auto iZProj_v = iZProj.View() ;
 | 
			
		||||
	    auto oZProj_v = oZProj.View() ;
 | 
			
		||||
	    auto A_p     =  A[p].View();
 | 
			
		||||
	    auto A_self  = A[self_stencil].View();
 | 
			
		||||
	    autoView( iZProj_v , iZProj, AcceleratorRead) ;
 | 
			
		||||
	    autoView( oZProj_v , oZProj, AcceleratorRead) ;
 | 
			
		||||
	    autoView( A_p     ,  A[p], AcceleratorWrite);
 | 
			
		||||
	    autoView( A_self  , A[self_stencil], AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
 | 
			
		||||
 | 
			
		||||
@@ -860,11 +581,11 @@ public:
 | 
			
		||||
	mult(tmp,phi,oddmask );  linop.Op(tmp,Mphio);
 | 
			
		||||
 | 
			
		||||
	{
 | 
			
		||||
	  auto tmp_      = tmp.View();
 | 
			
		||||
	  auto evenmask_ = evenmask.View();
 | 
			
		||||
	  auto oddmask_  =  oddmask.View();
 | 
			
		||||
	  auto Mphie_    =  Mphie.View();
 | 
			
		||||
	  auto Mphio_    =  Mphio.View();
 | 
			
		||||
	  autoView( tmp_      , tmp, AcceleratorWrite);
 | 
			
		||||
	  autoView( evenmask_ , evenmask, AcceleratorRead);
 | 
			
		||||
	  autoView( oddmask_  ,  oddmask, AcceleratorRead);
 | 
			
		||||
	  autoView( Mphie_    ,  Mphie, AcceleratorRead);
 | 
			
		||||
	  autoView( Mphio_    ,  Mphio, AcceleratorRead);
 | 
			
		||||
	  accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ 
 | 
			
		||||
	      coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
 | 
			
		||||
	    });
 | 
			
		||||
@@ -872,8 +593,8 @@ public:
 | 
			
		||||
 | 
			
		||||
	blockProject(SelfProj,tmp,Subspace.subspace);
 | 
			
		||||
 | 
			
		||||
	auto SelfProj_ = SelfProj.View();
 | 
			
		||||
	auto A_self  = A[self_stencil].View();
 | 
			
		||||
	autoView( SelfProj_ , SelfProj, AcceleratorRead);
 | 
			
		||||
	autoView( A_self  , A[self_stencil], AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
	accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
 | 
			
		||||
	  for(int j=0;j<nbasis;j++){
 | 
			
		||||
@@ -887,33 +608,8 @@ public:
 | 
			
		||||
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
 | 
			
		||||
      ForceHermitian();
 | 
			
		||||
    }
 | 
			
		||||
      // AssertHermitian();
 | 
			
		||||
      // ForceDiagonal();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
    ///////////////////////////
 | 
			
		||||
    // test code worth preserving in if block
 | 
			
		||||
    ///////////////////////////
 | 
			
		||||
    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++){
 | 
			
		||||
      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage<< A[p] << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
 | 
			
		||||
 | 
			
		||||
    phi=Subspace.subspace[0];
 | 
			
		||||
    std::vector<int> bc(FineGrid->_ndimension,0);
 | 
			
		||||
 | 
			
		||||
    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
 | 
			
		||||
    linop.Op(tmp,Mphi);                // Apply big dop
 | 
			
		||||
    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
 | 
			
		||||
    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<< iProj <<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  void ForceHermitian(void) {
 | 
			
		||||
    CoarseMatrix Diff  (Grid());
 | 
			
		||||
    for(int p=0;p<geom.npoint;p++){
 | 
			
		||||
@@ -933,27 +629,6 @@ public:
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  void AssertHermitian(void) {
 | 
			
		||||
    CoarseMatrix AA    (Grid());
 | 
			
		||||
    CoarseMatrix AAc   (Grid());
 | 
			
		||||
    CoarseMatrix Diff  (Grid());
 | 
			
		||||
    for(int d=0;d<4;d++){
 | 
			
		||||
	
 | 
			
		||||
      int dd=d+1;
 | 
			
		||||
      AAc = Cshift(A[2*d+1],dd,1);
 | 
			
		||||
      AA  = A[2*d];
 | 
			
		||||
	
 | 
			
		||||
      Diff = AA - adj(AAc);
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
 | 
			
		||||
	  
 | 
			
		||||
    }
 | 
			
		||||
    Diff = A[8] - adj(A[8]);
 | 
			
		||||
    std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
    
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,3 @@
 | 
			
		||||
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
@@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
template<class scalar> struct FFTW { };
 | 
			
		||||
@@ -191,7 +189,7 @@ public:
 | 
			
		||||
    typedef typename sobj::scalar_type   scalar;
 | 
			
		||||
      
 | 
			
		||||
    Lattice<sobj> pgbuf(&pencil_g);
 | 
			
		||||
    auto pgbuf_v = pgbuf.View();
 | 
			
		||||
    autoView(pgbuf_v , pgbuf, CpuWrite);
 | 
			
		||||
 | 
			
		||||
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
 | 
			
		||||
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
 | 
			
		||||
@@ -232,15 +230,18 @@ public:
 | 
			
		||||
    result = source;
 | 
			
		||||
    int pc = processor_coor[dim];
 | 
			
		||||
    for(int p=0;p<processors[dim];p++) {
 | 
			
		||||
      thread_for(idx, sgrid->lSites(),{
 | 
			
		||||
      {
 | 
			
		||||
	autoView(r_v,result,CpuRead);
 | 
			
		||||
	autoView(p_v,pgbuf,CpuWrite);
 | 
			
		||||
	thread_for(idx, sgrid->lSites(),{
 | 
			
		||||
          Coordinate cbuf(Nd);
 | 
			
		||||
          sobj s;
 | 
			
		||||
	  sgrid->LocalIndexToLocalCoor(idx,cbuf);
 | 
			
		||||
	  peekLocalSite(s,result,cbuf);
 | 
			
		||||
	  peekLocalSite(s,r_v,cbuf);
 | 
			
		||||
	  cbuf[dim]+=((pc+p) % processors[dim])*L;
 | 
			
		||||
	  //            cbuf[dim]+=p*L;
 | 
			
		||||
	  pokeLocalSite(s,pgbuf,cbuf);
 | 
			
		||||
      });
 | 
			
		||||
	  pokeLocalSite(s,p_v,cbuf);
 | 
			
		||||
        });
 | 
			
		||||
      }
 | 
			
		||||
      if (p != processors[dim] - 1) {
 | 
			
		||||
	result = Cshift(result,dim,L);
 | 
			
		||||
      }
 | 
			
		||||
@@ -269,15 +270,19 @@ public:
 | 
			
		||||
    flops+= flops_call*NN;
 | 
			
		||||
      
 | 
			
		||||
    // writing out result
 | 
			
		||||
    thread_for(idx,sgrid->lSites(),{
 | 
			
		||||
    {
 | 
			
		||||
      autoView(pgbuf_v,pgbuf,CpuRead);
 | 
			
		||||
      autoView(result_v,result,CpuWrite);
 | 
			
		||||
      thread_for(idx,sgrid->lSites(),{
 | 
			
		||||
	Coordinate clbuf(Nd), cgbuf(Nd);
 | 
			
		||||
	sobj s;
 | 
			
		||||
	sgrid->LocalIndexToLocalCoor(idx,clbuf);
 | 
			
		||||
	cgbuf = clbuf;
 | 
			
		||||
	cgbuf[dim] = clbuf[dim]+L*pc;
 | 
			
		||||
	peekLocalSite(s,pgbuf,cgbuf);
 | 
			
		||||
	pokeLocalSite(s,result,clbuf);
 | 
			
		||||
    });
 | 
			
		||||
	peekLocalSite(s,pgbuf_v,cgbuf);
 | 
			
		||||
	pokeLocalSite(s,result_v,clbuf);
 | 
			
		||||
      });
 | 
			
		||||
    }
 | 
			
		||||
    result = result*div;
 | 
			
		||||
      
 | 
			
		||||
    // destroying plan
 | 
			
		||||
 
 | 
			
		||||
@@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
 | 
			
		||||
 | 
			
		||||
        LinearCombTimer.Start();
 | 
			
		||||
        bo = beta * omega;
 | 
			
		||||
        auto p_v = p.View();
 | 
			
		||||
        auto r_v = r.View();
 | 
			
		||||
        auto v_v = v.View();
 | 
			
		||||
        accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
          coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
 | 
			
		||||
        });
 | 
			
		||||
	{
 | 
			
		||||
	  autoView( p_v , p, AcceleratorWrite);
 | 
			
		||||
	  autoView( r_v , r, AcceleratorRead);
 | 
			
		||||
	  autoView( v_v , v, AcceleratorRead);
 | 
			
		||||
	  accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	      coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
 | 
			
		||||
	    });
 | 
			
		||||
	}
 | 
			
		||||
        LinearCombTimer.Stop();
 | 
			
		||||
        LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
@@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
 | 
			
		||||
        alpha = rho / Calpha.real();
 | 
			
		||||
 | 
			
		||||
        LinearCombTimer.Start();
 | 
			
		||||
        auto h_v = h.View();
 | 
			
		||||
        auto psi_v = psi.View();
 | 
			
		||||
        accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
          coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
 | 
			
		||||
        });
 | 
			
		||||
        
 | 
			
		||||
        auto s_v = s.View();
 | 
			
		||||
        accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
          coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
 | 
			
		||||
        });
 | 
			
		||||
	{
 | 
			
		||||
	  autoView( p_v , p, AcceleratorRead);
 | 
			
		||||
	  autoView( r_v , r, AcceleratorRead);
 | 
			
		||||
	  autoView( v_v , v, AcceleratorRead);
 | 
			
		||||
	  autoView( psi_v,psi, AcceleratorRead);
 | 
			
		||||
	  autoView( h_v  ,  h, AcceleratorWrite);
 | 
			
		||||
	  autoView( s_v  ,  s, AcceleratorWrite);
 | 
			
		||||
	  accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	      coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
 | 
			
		||||
	    });
 | 
			
		||||
	  accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	      coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
 | 
			
		||||
 	  });
 | 
			
		||||
        }
 | 
			
		||||
        LinearCombTimer.Stop();
 | 
			
		||||
        LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
@@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction<Field>
 | 
			
		||||
        omega = Comega.real() / norm2(t);
 | 
			
		||||
 | 
			
		||||
        LinearCombTimer.Start();
 | 
			
		||||
        auto t_v = t.View();
 | 
			
		||||
        accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
          coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
 | 
			
		||||
          coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
 | 
			
		||||
        });
 | 
			
		||||
	{
 | 
			
		||||
	  autoView( psi_v,psi, AcceleratorWrite);
 | 
			
		||||
	  autoView( r_v , r, AcceleratorWrite);
 | 
			
		||||
	  autoView( h_v , h, AcceleratorRead);
 | 
			
		||||
	  autoView( s_v , s, AcceleratorRead);
 | 
			
		||||
	  autoView( t_v , t, AcceleratorRead);
 | 
			
		||||
	  accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	      coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
 | 
			
		||||
	      coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
 | 
			
		||||
	    });
 | 
			
		||||
	}
 | 
			
		||||
        LinearCombTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	
 | 
			
		||||
        cp = norm2(r);
 | 
			
		||||
        LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -140,13 +140,15 @@ public:
 | 
			
		||||
      b = cp / c;
 | 
			
		||||
 | 
			
		||||
      LinearCombTimer.Start();
 | 
			
		||||
      auto psi_v = psi.View();
 | 
			
		||||
      auto p_v   = p.View();
 | 
			
		||||
      auto r_v   = r.View();
 | 
			
		||||
      accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	  coalescedWrite(psi_v[ss], a      *  p_v(ss) + psi_v(ss));
 | 
			
		||||
	  coalescedWrite(p_v[ss]  , b      *  p_v(ss) + r_v  (ss));
 | 
			
		||||
      });
 | 
			
		||||
      {
 | 
			
		||||
	autoView( psi_v , psi, AcceleratorWrite);
 | 
			
		||||
	autoView( p_v   , p,   AcceleratorWrite);
 | 
			
		||||
	autoView( r_v   , r,   AcceleratorWrite);
 | 
			
		||||
	accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
 | 
			
		||||
	    coalescedWrite(psi_v[ss], a      *  p_v(ss) + psi_v(ss));
 | 
			
		||||
	    coalescedWrite(p_v[ss]  , b      *  p_v(ss) + r_v  (ss));
 | 
			
		||||
	});
 | 
			
		||||
      }
 | 
			
		||||
      LinearCombTimer.Stop();
 | 
			
		||||
      LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,241 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef GRID_PREC_GCR_NON_HERM_H
 | 
			
		||||
#define GRID_PREC_GCR_NON_HERM_H
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//VPGCR Abe and Zhang, 2005.
 | 
			
		||||
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
 | 
			
		||||
//Computing and Information Volume 2, Number 2, Pages 147-161
 | 
			
		||||
//NB. Likely not original reference since they are focussing on a preconditioner variant.
 | 
			
		||||
//    but VPGCR was nicely written up in their paper
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" " 
 | 
			
		||||
 | 
			
		||||
template<class Field>
 | 
			
		||||
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 | 
			
		||||
public:                                                
 | 
			
		||||
 | 
			
		||||
  RealD   Tolerance;
 | 
			
		||||
  Integer MaxIterations;
 | 
			
		||||
  int verbose;
 | 
			
		||||
  int mmax;
 | 
			
		||||
  int nstep;
 | 
			
		||||
  int steps;
 | 
			
		||||
  int level;
 | 
			
		||||
  GridStopWatch PrecTimer;
 | 
			
		||||
  GridStopWatch MatTimer;
 | 
			
		||||
  GridStopWatch LinalgTimer;
 | 
			
		||||
 | 
			
		||||
  LinearFunction<Field>     &Preconditioner;
 | 
			
		||||
  LinearOperatorBase<Field> &Linop;
 | 
			
		||||
 | 
			
		||||
  void Level(int lv) { level=lv; };
 | 
			
		||||
 | 
			
		||||
  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
 | 
			
		||||
    Tolerance(tol), 
 | 
			
		||||
    MaxIterations(maxit),
 | 
			
		||||
    Linop(_Linop),
 | 
			
		||||
    Preconditioner(Prec),
 | 
			
		||||
    mmax(_mmax),
 | 
			
		||||
    nstep(_nstep)
 | 
			
		||||
  { 
 | 
			
		||||
    level=1;
 | 
			
		||||
    verbose=1;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  void operator() (const Field &src, Field &psi){
 | 
			
		||||
 | 
			
		||||
    psi=Zero();
 | 
			
		||||
    RealD cp, ssq,rsq;
 | 
			
		||||
    ssq=norm2(src);
 | 
			
		||||
    rsq=Tolerance*Tolerance*ssq;
 | 
			
		||||
      
 | 
			
		||||
    Field r(src.Grid());
 | 
			
		||||
 | 
			
		||||
    PrecTimer.Reset();
 | 
			
		||||
    MatTimer.Reset();
 | 
			
		||||
    LinalgTimer.Reset();
 | 
			
		||||
 | 
			
		||||
    GridStopWatch SolverTimer;
 | 
			
		||||
    SolverTimer.Start();
 | 
			
		||||
 | 
			
		||||
    steps=0;
 | 
			
		||||
    for(int k=0;k<MaxIterations;k++){
 | 
			
		||||
 | 
			
		||||
      cp=GCRnStep(src,psi,rsq);
 | 
			
		||||
 | 
			
		||||
      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
 | 
			
		||||
 | 
			
		||||
      if(cp<rsq) {
 | 
			
		||||
 | 
			
		||||
	SolverTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	Linop.Op(psi,r);
 | 
			
		||||
	axpy(r,-1.0,src,r);
 | 
			
		||||
	RealD tr = norm2(r);
 | 
			
		||||
	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
 | 
			
		||||
		 << " computed residual "<<sqrt(cp/ssq)
 | 
			
		||||
		 << " true residual "    <<sqrt(tr/ssq)
 | 
			
		||||
		 << " target "           <<Tolerance <<std::endl;
 | 
			
		||||
 | 
			
		||||
	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
 | 
			
		||||
    //    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
 | 
			
		||||
 | 
			
		||||
    RealD cp;
 | 
			
		||||
    ComplexD a, b, zAz;
 | 
			
		||||
    RealD zAAz;
 | 
			
		||||
    ComplexD rq;
 | 
			
		||||
 | 
			
		||||
    GridBase *grid = src.Grid();
 | 
			
		||||
 | 
			
		||||
    Field r(grid);
 | 
			
		||||
    Field z(grid);
 | 
			
		||||
    Field tmp(grid);
 | 
			
		||||
    Field ttmp(grid);
 | 
			
		||||
    Field Az(grid);
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////
 | 
			
		||||
    // history for flexible orthog
 | 
			
		||||
    ////////////////////////////////
 | 
			
		||||
    std::vector<Field> q(mmax,grid);
 | 
			
		||||
    std::vector<Field> p(mmax,grid);
 | 
			
		||||
    std::vector<RealD> qq(mmax);
 | 
			
		||||
      
 | 
			
		||||
    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////
 | 
			
		||||
    // initial guess x0 is taken as nonzero.
 | 
			
		||||
    // r0=src-A x0 = src
 | 
			
		||||
    //////////////////////////////////
 | 
			
		||||
    MatTimer.Start();
 | 
			
		||||
    Linop.Op(psi,Az);
 | 
			
		||||
    zAz = innerProduct(Az,psi);
 | 
			
		||||
    zAAz= norm2(Az);
 | 
			
		||||
    MatTimer.Stop();
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    LinalgTimer.Start();
 | 
			
		||||
    r=src-Az;
 | 
			
		||||
    LinalgTimer.Stop();
 | 
			
		||||
    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    /////////////////////
 | 
			
		||||
    // p = Prec(r)
 | 
			
		||||
    /////////////////////
 | 
			
		||||
 | 
			
		||||
    PrecTimer.Start();
 | 
			
		||||
    Preconditioner(r,z);
 | 
			
		||||
    PrecTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    MatTimer.Start();
 | 
			
		||||
    Linop.Op(z,Az);
 | 
			
		||||
    MatTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    LinalgTimer.Start();
 | 
			
		||||
 | 
			
		||||
    zAz = innerProduct(Az,psi);
 | 
			
		||||
    zAAz= norm2(Az);
 | 
			
		||||
 | 
			
		||||
    //p[0],q[0],qq[0] 
 | 
			
		||||
    p[0]= z;
 | 
			
		||||
    q[0]= Az;
 | 
			
		||||
    qq[0]= zAAz;
 | 
			
		||||
    
 | 
			
		||||
    cp =norm2(r);
 | 
			
		||||
    LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    for(int k=0;k<nstep;k++){
 | 
			
		||||
 | 
			
		||||
      steps++;
 | 
			
		||||
 | 
			
		||||
      int kp     = k+1;
 | 
			
		||||
      int peri_k = k %mmax;
 | 
			
		||||
      int peri_kp= kp%mmax;
 | 
			
		||||
 | 
			
		||||
      LinalgTimer.Start();
 | 
			
		||||
      rq= innerProduct(q[peri_k],r); // what if rAr not real?
 | 
			
		||||
      a = rq/qq[peri_k];
 | 
			
		||||
 | 
			
		||||
      axpy(psi,a,p[peri_k],psi);         
 | 
			
		||||
 | 
			
		||||
      cp = axpy_norm(r,-a,q[peri_k],r);
 | 
			
		||||
      LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
 | 
			
		||||
 | 
			
		||||
      if((k==nstep-1)||(cp<rsq)){
 | 
			
		||||
	return cp;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      PrecTimer.Start();
 | 
			
		||||
      Preconditioner(r,z);// solve Az = r
 | 
			
		||||
      PrecTimer.Stop();
 | 
			
		||||
 | 
			
		||||
      MatTimer.Start();
 | 
			
		||||
      Linop.Op(z,Az);
 | 
			
		||||
      MatTimer.Stop();
 | 
			
		||||
      zAz = innerProduct(Az,psi);
 | 
			
		||||
      zAAz= norm2(Az);
 | 
			
		||||
 | 
			
		||||
      LinalgTimer.Start();
 | 
			
		||||
 | 
			
		||||
      q[peri_kp]=Az;
 | 
			
		||||
      p[peri_kp]=z;
 | 
			
		||||
 | 
			
		||||
      int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
 | 
			
		||||
      for(int back=0;back<northog;back++){
 | 
			
		||||
 | 
			
		||||
	int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
 | 
			
		||||
 | 
			
		||||
	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
 | 
			
		||||
	p[peri_kp]=p[peri_kp]+b*p[peri_back];
 | 
			
		||||
	q[peri_kp]=q[peri_kp]+b*q[peri_back];
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
 | 
			
		||||
      LinalgTimer.Stop();
 | 
			
		||||
    }
 | 
			
		||||
    assert(0); // never reached
 | 
			
		||||
    return cp;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
#endif
 | 
			
		||||
@@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
MemoryStats *MemoryProfiler::stats = nullptr;
 | 
			
		||||
bool         MemoryProfiler::debug = false;
 | 
			
		||||
 | 
			
		||||
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
int PointerCache::Ncache      = 32;
 | 
			
		||||
#else 
 | 
			
		||||
int PointerCache::Ncache      = 8;
 | 
			
		||||
#endif
 | 
			
		||||
int PointerCache::Victim;
 | 
			
		||||
int PointerCache::VictimSmall;
 | 
			
		||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
 | 
			
		||||
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
 | 
			
		||||
 | 
			
		||||
void PointerCache::Init(void)
 | 
			
		||||
{
 | 
			
		||||
  char * str;
 | 
			
		||||
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
 | 
			
		||||
  if ( str ) Ncache = atoi(str);
 | 
			
		||||
  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
 | 
			
		||||
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
 | 
			
		||||
  if ( str ) NcacheSmall = atoi(str);
 | 
			
		||||
  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
 | 
			
		||||
 | 
			
		||||
  //  printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
 | 
			
		||||
}
 | 
			
		||||
void *PointerCache::Insert(void *ptr,size_t bytes) 
 | 
			
		||||
{
 | 
			
		||||
  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
 | 
			
		||||
    return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
 | 
			
		||||
  return Insert(ptr,bytes,Entries,Ncache,Victim);  
 | 
			
		||||
}
 | 
			
		||||
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
  void * ret = NULL;
 | 
			
		||||
  int v = -1;
 | 
			
		||||
 | 
			
		||||
  for(int e=0;e<ncache;e++) {
 | 
			
		||||
    if ( entries[e].valid==0 ) {
 | 
			
		||||
      v=e; 
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( v==-1 ) {
 | 
			
		||||
    v=victim;
 | 
			
		||||
    victim = (victim+1)%ncache;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( entries[v].valid ) {
 | 
			
		||||
    ret = entries[v].address;
 | 
			
		||||
    entries[v].valid = 0;
 | 
			
		||||
    entries[v].address = NULL;
 | 
			
		||||
    entries[v].bytes = 0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  entries[v].address=ptr;
 | 
			
		||||
  entries[v].bytes  =bytes;
 | 
			
		||||
  entries[v].valid  =1;
 | 
			
		||||
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void *PointerCache::Lookup(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
 | 
			
		||||
    return Lookup(bytes,EntriesSmall,NcacheSmall);
 | 
			
		||||
  return Lookup(bytes,Entries,Ncache);
 | 
			
		||||
}
 | 
			
		||||
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
  for(int e=0;e<ncache;e++){
 | 
			
		||||
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
 | 
			
		||||
      entries[e].valid = 0;
 | 
			
		||||
      return entries[e].address;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void check_huge_pages(void *Buf,uint64_t BYTES)
 | 
			
		||||
{
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
 
 | 
			
		||||
@@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef GRID_ALIGNED_ALLOCATOR_H
 | 
			
		||||
#define GRID_ALIGNED_ALLOCATOR_H
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MALLOC_MALLOC_H
 | 
			
		||||
#include <malloc/malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef HAVE_MALLOC_H
 | 
			
		||||
#include <malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
#include <mm_malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define POINTER_CACHE
 | 
			
		||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
 | 
			
		||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// Move control to configure.ac and Config.h?
 | 
			
		||||
 | 
			
		||||
// Static cache of previously released allocations, kept in two fixed-size
// tables (one for requests below GRID_ALLOC_SMALL_LIMIT, one for the rest).
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
/* Could make these configurable, perhaps up to a max size*/

  // One cached allocation
  struct PointerCacheEntry {
    void *address;
    size_t bytes;
    int valid;
  };

  // Compiled-in table sizes and the depths actually in use
  static const int NcacheSmallMax=128;
  static const int NcacheMax=16;
  static int NcacheSmall;
  static int Ncache;

  // Large-allocation table and its round-robin eviction index
  static PointerCacheEntry Entries[NcacheMax];
  static int Victim;

  // Small-allocation table and its round-robin eviction index
  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
  static int VictimSmall;

public:
  static void Init(void);
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes) ;
  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
};
 | 
			
		||||
 | 
			
		||||
std::string sizeString(size_t bytes);
 | 
			
		||||
 | 
			
		||||
// Byte counters updated by the profilerAllocate / profilerFree macros.
// All counters start at zero.
struct MemoryStats
{
  size_t totalAllocated{0};      // sum of all allocation sizes
  size_t maxAllocated{0};        // high-water mark of currentlyAllocated
  size_t currentlyAllocated{0};  // allocated minus freed
  size_t totalFreed{0};          // sum of all freed sizes
};
 | 
			
		||||
    
 | 
			
		||||
class MemoryProfiler
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  static MemoryStats *stats;
 | 
			
		||||
  static bool        debug;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#define profilerCudaMeminfo \
 | 
			
		||||
  { size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
 | 
			
		||||
#else
 | 
			
		||||
#define profilerCudaMeminfo
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// Formats a byte count as "<n> (<human readable>)".  The expansion is fully
// parenthesised so it behaves as a single expression wherever it is used.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
 | 
			
		||||
#define profilerDebugPrint						\
 | 
			
		||||
  if (MemoryProfiler::stats)						\
 | 
			
		||||
    {									\
 | 
			
		||||
      auto s = MemoryProfiler::stats;					\
 | 
			
		||||
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
 | 
			
		||||
      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(s->totalAllocated) \
 | 
			
		||||
		<< std::endl;						\
 | 
			
		||||
      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(s->maxAllocated) \
 | 
			
		||||
		<< std::endl;						\
 | 
			
		||||
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
 | 
			
		||||
		<< std::endl;						\
 | 
			
		||||
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(s->totalFreed) \
 | 
			
		||||
		<< std::endl;						\
 | 
			
		||||
    }									\
 | 
			
		||||
  profilerCudaMeminfo;
 | 
			
		||||
 | 
			
		||||
// Records an allocation of 'bytes' in MemoryProfiler::stats (if set) and,
// when MemoryProfiler::debug is on, logs it followed by the full statistics.
// Wrapped in do { } while (0) so the macro is a single statement and can be
// used safely as the body of an if/else; invoke it with a trailing ';'.
#define profilerAllocate(bytes)						\
  do {									\
    if (MemoryProfiler::stats)						\
      {									\
	auto s = MemoryProfiler::stats;					\
	s->totalAllocated     += (bytes);				\
	s->currentlyAllocated += (bytes);				\
	s->maxAllocated        = std::max(s->maxAllocated, s->currentlyAllocated); \
      }									\
    if (MemoryProfiler::debug)						\
      {									\
	std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
	profilerDebugPrint;						\
      }									\
  } while (0)
 | 
			
		||||
 | 
			
		||||
// Records a release of 'bytes' in MemoryProfiler::stats (if set) and, when
// MemoryProfiler::debug is on, logs it followed by the full statistics.
// Wrapped in do { } while (0) so the macro is a single statement and can be
// used safely as the body of an if/else; invoke it with a trailing ';'.
#define profilerFree(bytes)						\
  do {									\
    if (MemoryProfiler::stats)						\
      {									\
	auto s = MemoryProfiler::stats;					\
	s->totalFreed         += (bytes);				\
	s->currentlyAllocated -= (bytes);				\
      }									\
    if (MemoryProfiler::debug)						\
      {									\
	std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
	profilerDebugPrint;						\
      }									\
  } while (0)
 | 
			
		||||
 | 
			
		||||
void check_huge_pages(void *Buf,uint64_t BYTES);
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
// A lattice of something, but assume the something is SIMDized.
 | 
			
		||||
////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<typename _Tp>
 | 
			
		||||
class alignedAllocator {
 | 
			
		||||
public: 
 | 
			
		||||
@@ -172,70 +53,60 @@ public:
 | 
			
		||||
  { 
 | 
			
		||||
    size_type bytes = __n*sizeof(_Tp);
 | 
			
		||||
    profilerAllocate(bytes);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifdef POINTER_CACHE
 | 
			
		||||
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
 | 
			
		||||
#else
 | 
			
		||||
    pointer ptr = nullptr;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
    ////////////////////////////////////
 | 
			
		||||
    // Unified (managed) memory
 | 
			
		||||
    ////////////////////////////////////
 | 
			
		||||
    if ( ptr == (_Tp *) NULL ) {
 | 
			
		||||
      //      printf(" alignedAllocater cache miss %ld bytes ",bytes);      BACKTRACEFP(stdout);
 | 
			
		||||
      auto err = cudaMallocManaged((void **)&ptr,bytes);
 | 
			
		||||
      if( err != cudaSuccess ) {
 | 
			
		||||
	ptr = (_Tp *) NULL;
 | 
			
		||||
	std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    } 
 | 
			
		||||
    assert( ptr != (_Tp *)NULL);
 | 
			
		||||
#else 
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // 2MB align; could make option probably doesn't need configurability
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  #ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 | 
			
		||||
  #else
 | 
			
		||||
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 | 
			
		||||
  #endif
 | 
			
		||||
    assert( ptr != (_Tp *)NULL);
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////
 | 
			
		||||
    // First touch optimise in threaded loop 
 | 
			
		||||
    //////////////////////////////////////////////////
 | 
			
		||||
    uint64_t *cp = (uint64_t *)ptr;
 | 
			
		||||
    thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
 | 
			
		||||
      cp[n]=0;
 | 
			
		||||
    });
 | 
			
		||||
#endif
 | 
			
		||||
    _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
 | 
			
		||||
    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void deallocate(pointer __p, size_type __n) { 
 | 
			
		||||
  void deallocate(pointer __p, size_type __n) 
 | 
			
		||||
  { 
 | 
			
		||||
    size_type bytes = __n * sizeof(_Tp);
 | 
			
		||||
 | 
			
		||||
    profilerFree(bytes);
 | 
			
		||||
    MemoryManager::CpuFree((void *)__p,bytes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#ifdef POINTER_CACHE
 | 
			
		||||
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 | 
			
		||||
#else 
 | 
			
		||||
    pointer __freeme = __p;
 | 
			
		||||
#endif
 | 
			
		||||
  // FIXME: hack for the copy constructor, eventually it must be avoided
 | 
			
		||||
  //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
 | 
			
		||||
  void construct(pointer __p, const _Tp& __val) { assert(0);};
 | 
			
		||||
  void construct(pointer __p) { };
 | 
			
		||||
  void destroy(pointer __p) { };
 | 
			
		||||
};
 | 
			
		||||
template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 | 
			
		||||
template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
    if ( __freeme ) cudaFree((void *)__freeme);
 | 
			
		||||
#else 
 | 
			
		||||
  #ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
    if ( __freeme ) _mm_free((void *)__freeme); 
 | 
			
		||||
  #else
 | 
			
		||||
    if ( __freeme ) free((void *)__freeme);
 | 
			
		||||
  #endif
 | 
			
		||||
#endif
 | 
			
		||||
template<typename _Tp>
 | 
			
		||||
class uvmAllocator {
 | 
			
		||||
public: 
 | 
			
		||||
  typedef std::size_t     size_type;
 | 
			
		||||
  typedef std::ptrdiff_t  difference_type;
 | 
			
		||||
  typedef _Tp*       pointer;
 | 
			
		||||
  typedef const _Tp* const_pointer;
 | 
			
		||||
  typedef _Tp&       reference;
 | 
			
		||||
  typedef const _Tp& const_reference;
 | 
			
		||||
  typedef _Tp        value_type;
 | 
			
		||||
 | 
			
		||||
  template<typename _Tp1>  struct rebind { typedef uvmAllocator<_Tp1> other; };
 | 
			
		||||
  uvmAllocator() throw() { }
 | 
			
		||||
  uvmAllocator(const uvmAllocator&) throw() { }
 | 
			
		||||
  template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
 | 
			
		||||
  ~uvmAllocator() throw() { }
 | 
			
		||||
  pointer       address(reference __x)       const { return &__x; }
 | 
			
		||||
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 | 
			
		||||
 | 
			
		||||
  // Obtain storage for __n objects of type _Tp through MemoryManager::SharedAllocate.
  // The hint pointer _p exists for allocator-interface compatibility and is not consulted.
  // A NULL result is treated as fatal via assert.
  pointer allocate(size_type __n, const void* _p= 0)
  {
    size_type nbytes = sizeof(_Tp) * __n;
    profilerAllocate(nbytes);

    _Tp *storage = (_Tp *) MemoryManager::SharedAllocate(nbytes);
    assert( storage != (_Tp *)NULL );
    return storage;
  }
 | 
			
		||||
 | 
			
		||||
  // Return storage for __n objects of type _Tp to MemoryManager::SharedFree,
  // after reporting the byte count to the profiler hook.
  void deallocate(pointer __p, size_type __n)
  {
    size_type nbytes = sizeof(_Tp) * __n;
    profilerFree(nbytes);

    void *raw = (void *)__p;
    MemoryManager::SharedFree(raw,nbytes);
  }
 | 
			
		||||
 | 
			
		||||
  // FIXME: hack for the copy constructor, eventually it must be avoided
 | 
			
		||||
@@ -244,17 +115,17 @@ public:
 | 
			
		||||
  void construct(pointer __p) { };
 | 
			
		||||
  void destroy(pointer __p) { };
 | 
			
		||||
};
 | 
			
		||||
template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 | 
			
		||||
template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 | 
			
		||||
template<typename _Tp>  inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
 | 
			
		||||
template<typename _Tp>  inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Template typedefs
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class T> using commAllocator = alignedAllocator<T>;
 | 
			
		||||
template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
 | 
			
		||||
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
 | 
			
		||||
template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 | 
			
		||||
template<class T> using commAllocator = uvmAllocator<T>;
 | 
			
		||||
template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
 | 
			
		||||
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
 | 
			
		||||
//template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										4
									
								
								Grid/allocator/Allocator.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								Grid/allocator/Allocator.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,4 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
#include <Grid/allocator/MemoryStats.h>
 | 
			
		||||
#include <Grid/allocator/MemoryManager.h>
 | 
			
		||||
#include <Grid/allocator/AlignedAllocator.h>
 | 
			
		||||
							
								
								
									
										244
									
								
								Grid/allocator/MemoryManager.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										244
									
								
								Grid/allocator/MemoryManager.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,244 @@
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
/*Allocation types, saying which pointer cache should be used*/
 | 
			
		||||
#define Cpu      (0)
 | 
			
		||||
#define CpuSmall (1)
 | 
			
		||||
#define Acc      (2)
 | 
			
		||||
#define AccSmall (3)
 | 
			
		||||
#define Shared   (4)
 | 
			
		||||
#define SharedSmall (5)
 | 
			
		||||
// Running byte counters updated by the MemoryManager allocate/free routines in this
// file.  They have static storage duration and therefore start at zero.
uint64_t total_shared;  // bytes obtained via acceleratorAllocShared for the shared pool
uint64_t total_device;  // bytes obtained via acceleratorAllocDevice
uint64_t total_host;    // bytes obtained for the cpu pool
 | 
			
		||||
void MemoryManager::PrintBytes(void)
 | 
			
		||||
{
 | 
			
		||||
  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
 | 
			
		||||
  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
 | 
			
		||||
  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Data tables for the caches of recently freed pointers
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 | 
			
		||||
int MemoryManager::Victim[MemoryManager::NallocType];
 | 
			
		||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Actual allocation and deallocation utils
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
void *MemoryManager::AcceleratorAllocate(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr = (void *) Lookup(bytes,Acc);
 | 
			
		||||
  if ( ptr == (void *) NULL ) {
 | 
			
		||||
    ptr = (void *) acceleratorAllocDevice(bytes);
 | 
			
		||||
    total_device+=bytes;
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *__freeme = Insert(ptr,bytes,Acc);
 | 
			
		||||
  if ( __freeme ) {
 | 
			
		||||
    acceleratorFreeDevice(__freeme);
 | 
			
		||||
    total_device-=bytes;
 | 
			
		||||
    //    PrintBytes();
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void *MemoryManager::SharedAllocate(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr = (void *) Lookup(bytes,Shared);
 | 
			
		||||
  if ( ptr == (void *) NULL ) {
 | 
			
		||||
    ptr = (void *) acceleratorAllocShared(bytes);
 | 
			
		||||
    total_shared+=bytes;
 | 
			
		||||
    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
 | 
			
		||||
    //    PrintBytes();
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *__freeme = Insert(ptr,bytes,Shared);
 | 
			
		||||
  if ( __freeme ) {
 | 
			
		||||
    acceleratorFreeShared(__freeme);
 | 
			
		||||
    total_shared-=bytes;
 | 
			
		||||
    //    PrintBytes();
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#ifdef GRID_UVM
 | 
			
		||||
void *MemoryManager::CpuAllocate(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
			
		||||
  if ( ptr == (void *) NULL ) {
 | 
			
		||||
    ptr = (void *) acceleratorAllocShared(bytes);
 | 
			
		||||
    total_host+=bytes;
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  NotifyDeletion(_ptr);
 | 
			
		||||
  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
			
		||||
  if ( __freeme ) { 
 | 
			
		||||
    acceleratorFreeShared(__freeme);
 | 
			
		||||
    total_host-=bytes;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
void *MemoryManager::CpuAllocate(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr = (void *) Lookup(bytes,Cpu);
 | 
			
		||||
  if ( ptr == (void *) NULL ) {
 | 
			
		||||
    ptr = (void *) acceleratorAllocCpu(bytes);
 | 
			
		||||
    total_host+=bytes;
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  NotifyDeletion(_ptr);
 | 
			
		||||
  void *__freeme = Insert(_ptr,bytes,Cpu);
 | 
			
		||||
  if ( __freeme ) { 
 | 
			
		||||
    acceleratorFreeCpu(__freeme);
 | 
			
		||||
    total_host-=bytes;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
// call only once
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
void MemoryManager::Init(void)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  char * str;
 | 
			
		||||
  int Nc;
 | 
			
		||||
  int NcS;
 | 
			
		||||
  
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
 | 
			
		||||
  if ( str ) {
 | 
			
		||||
    Nc = atoi(str);
 | 
			
		||||
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
 | 
			
		||||
      Ncache[Cpu]=Nc;
 | 
			
		||||
      Ncache[Acc]=Nc;
 | 
			
		||||
      Ncache[Shared]=Nc;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
 | 
			
		||||
  if ( str ) {
 | 
			
		||||
    Nc = atoi(str);
 | 
			
		||||
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
 | 
			
		||||
      Ncache[CpuSmall]=Nc;
 | 
			
		||||
      Ncache[AccSmall]=Nc;
 | 
			
		||||
      Ncache[SharedSmall]=Nc;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 | 
			
		||||
#ifdef ALLOCATION_CACHE
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_UVM
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#else
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Offer a freed buffer to the free pool selected by `type`.
// Buffers smaller than GRID_ALLOC_SMALL_LIMIT use the pool at index type+1,
// all others the pool at index type.  The return value is the pointer the caller
// must actually release (possibly NULL).  Without ALLOCATION_CACHE the buffer is
// handed straight back.
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? type+1 : type;
  return Insert(ptr,bytes,Entries[pool],Ncache[pool],Victim[pool]);
#else
  return ptr;
#endif
}
 | 
			
		||||
 | 
			
		||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
 | 
			
		||||
{
 | 
			
		||||
  assert(ncache>0);
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
  void * ret = NULL;
 | 
			
		||||
  int v = -1;
 | 
			
		||||
 | 
			
		||||
  for(int e=0;e<ncache;e++) {
 | 
			
		||||
    if ( entries[e].valid==0 ) {
 | 
			
		||||
      v=e; 
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( v==-1 ) {
 | 
			
		||||
    v=victim;
 | 
			
		||||
    victim = (victim+1)%ncache;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( entries[v].valid ) {
 | 
			
		||||
    ret = entries[v].address;
 | 
			
		||||
    entries[v].valid = 0;
 | 
			
		||||
    entries[v].address = NULL;
 | 
			
		||||
    entries[v].bytes = 0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  entries[v].address=ptr;
 | 
			
		||||
  entries[v].bytes  =bytes;
 | 
			
		||||
  entries[v].valid  =1;
 | 
			
		||||
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Ask the free pool selected by `type` for a buffer of `bytes` bytes.
// Requests smaller than GRID_ALLOC_SMALL_LIMIT use the pool at index type+1,
// all others the pool at index type.  Returns NULL when nothing is available,
// and always NULL without ALLOCATION_CACHE.
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? type+1 : type;
  return Lookup(bytes,Entries[pool],Ncache[pool]);
#else
  return NULL;
#endif
}
 | 
			
		||||
 | 
			
		||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
 | 
			
		||||
{
 | 
			
		||||
  assert(ncache>0);
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
  for(int e=0;e<ncache;e++){
 | 
			
		||||
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
 | 
			
		||||
      entries[e].valid = 0;
 | 
			
		||||
      return entries[e].address;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										181
									
								
								Grid/allocator/MemoryManager.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										181
									
								
								Grid/allocator/MemoryManager.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,181 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/MemoryManager.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once
 | 
			
		||||
#include <list> 
 | 
			
		||||
#include <unordered_map>  
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// Move control to configure.ac and Config.h?
 | 
			
		||||
 | 
			
		||||
#define ALLOCATION_CACHE
 | 
			
		||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
 | 
			
		||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
 | 
			
		||||
 | 
			
		||||
/*Pinning pages is costly*/
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Advise the LatticeAccelerator class
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Usage hint passed alongside a view request.
enum ViewAdvise {
  AdviseDefault       = 0,   // Regular data
  AdviseInfrequentUse = 1    // The data is used infrequently.  This can significantly
                             // influence performance of bulk storage.

  // Not enabled:
  //   AdviseTransient               = 0x2   Data will mostly be read.  On some architectures
  //                                         enables read-only copies of memory to be kept on
  //                                         host and device.
  //   AdviseAcceleratorWriteDiscard = 0x4   Field will be written in entirety on device
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// View Access Mode
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Access mode requested when opening a view.  Each mode has its own bit, except
// that CpuWriteDiscard currently shares its value with CpuWrite.
enum ViewMode {
  AcceleratorRead         = (1<<0),
  AcceleratorWrite        = (1<<1),
  AcceleratorWriteDiscard = (1<<2),
  CpuRead                 = (1<<3),
  CpuWrite                = (1<<4),
  CpuWriteDiscard         = CpuWrite // same for now
};
 | 
			
		||||
 | 
			
		||||
class MemoryManager {
 | 
			
		||||
private:
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  // For caching recently freed allocations
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  typedef struct { 
 | 
			
		||||
    void *address;
 | 
			
		||||
    size_t bytes;
 | 
			
		||||
    int valid;
 | 
			
		||||
  } AllocationCacheEntry;
 | 
			
		||||
 | 
			
		||||
  static const int NallocCacheMax=128; 
 | 
			
		||||
  static const int NallocType=6;
 | 
			
		||||
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 | 
			
		||||
  static int Victim[NallocType];
 | 
			
		||||
  static int Ncache[NallocType];
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////
 | 
			
		||||
  // Free pool
 | 
			
		||||
  /////////////////////////////////////////////////
 | 
			
		||||
  static void *Insert(void *ptr,size_t bytes,int type) ;
 | 
			
		||||
  static void *Lookup(size_t bytes,int type) ;
 | 
			
		||||
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
 | 
			
		||||
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
 | 
			
		||||
 | 
			
		||||
  static void *AcceleratorAllocate(size_t bytes);
 | 
			
		||||
  static void  AcceleratorFree    (void *ptr,size_t bytes);
 | 
			
		||||
  static void PrintBytes(void);
 | 
			
		||||
 public:
 | 
			
		||||
  static void Init(void);
 | 
			
		||||
  static void *SharedAllocate(size_t bytes);
 | 
			
		||||
  static void  SharedFree    (void *ptr,size_t bytes);
 | 
			
		||||
  static void *CpuAllocate(size_t bytes);
 | 
			
		||||
  static void  CpuFree    (void *ptr,size_t bytes);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Footprint tracking
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  static uint64_t     DeviceBytes;
 | 
			
		||||
  static uint64_t     DeviceLRUBytes;
 | 
			
		||||
  static uint64_t     DeviceMaxBytes;
 | 
			
		||||
  static uint64_t     HostToDeviceBytes;
 | 
			
		||||
  static uint64_t     DeviceToHostBytes;
 | 
			
		||||
  static uint64_t     HostToDeviceXfer;
 | 
			
		||||
  static uint64_t     DeviceToHostXfer;
 | 
			
		||||
 
 | 
			
		||||
 private:
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Data tables for ViewCache
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  typedef std::list<uint64_t> LRU_t;
 | 
			
		||||
  typedef typename LRU_t::iterator LRUiterator;
 | 
			
		||||
  typedef struct { 
 | 
			
		||||
    int        LRU_valid;
 | 
			
		||||
    LRUiterator LRU_entry;
 | 
			
		||||
    uint64_t CpuPtr;
 | 
			
		||||
    uint64_t AccPtr;
 | 
			
		||||
    size_t   bytes;
 | 
			
		||||
    uint32_t transient;
 | 
			
		||||
    uint32_t state;
 | 
			
		||||
    uint32_t accLock;
 | 
			
		||||
    uint32_t cpuLock;
 | 
			
		||||
  } AcceleratorViewEntry;
 | 
			
		||||
  
 | 
			
		||||
  typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
 | 
			
		||||
  typedef typename AccViewTable_t::iterator AccViewTableIterator ;
 | 
			
		||||
 | 
			
		||||
  static AccViewTable_t AccViewTable;
 | 
			
		||||
  static LRU_t LRU;
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////
 | 
			
		||||
  // Device motion
 | 
			
		||||
  /////////////////////////////////////////////////
 | 
			
		||||
  static void  Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
			
		||||
  static void  EvictVictims(uint64_t bytes); // Frees up <bytes>
 | 
			
		||||
  static void  Evict(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  Flush(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  Clone(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  AccDiscard(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  CpuDiscard(AcceleratorViewEntry &AccCache);
 | 
			
		||||
 | 
			
		||||
  //  static void  LRUupdate(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  LRUinsert(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  static void  LRUremove(AcceleratorViewEntry &AccCache);
 | 
			
		||||
  
 | 
			
		||||
  // manage entries in the table
 | 
			
		||||
  static int                  EntryPresent(uint64_t CpuPtr);
 | 
			
		||||
  static void                 EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
			
		||||
  static void                 EntryErase (uint64_t CpuPtr);
 | 
			
		||||
  static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
 | 
			
		||||
  static void                 EntrySet   (uint64_t CpuPtr,AcceleratorViewEntry &entry);
 | 
			
		||||
 | 
			
		||||
  static void     AcceleratorViewClose(uint64_t AccPtr);
 | 
			
		||||
  static uint64_t AcceleratorViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
			
		||||
  static void     CpuViewClose(uint64_t Ptr);
 | 
			
		||||
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
			
		||||
#endif
 | 
			
		||||
  static void NotifyDeletion(void * CpuPtr);
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
  static void Print(void);
 | 
			
		||||
  static int   isOpen   (void* CpuPtr);
 | 
			
		||||
  static void  ViewClose(void* CpuPtr,ViewMode mode);
 | 
			
		||||
  static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										468
									
								
								Grid/allocator/MemoryManagerCache.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										468
									
								
								Grid/allocator/MemoryManagerCache.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,468 @@
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
 | 
			
		||||
#warning "Using explicit device memory copies"
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
#define dprintf(...)
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////
 | 
			
		||||
// For caching copies of data on device
 | 
			
		||||
////////////////////////////////////////////////////////////
 | 
			
		||||
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
 | 
			
		||||
MemoryManager::LRU_t MemoryManager::LRU;
 | 
			
		||||
  
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
// Footprint tracking
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
uint64_t  MemoryManager::DeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceLRUBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceMaxBytes = 1024*1024*128;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
// Priority ordering for unlocked entries
 | 
			
		||||
//  Empty
 | 
			
		||||
//  CpuDirty 
 | 
			
		||||
//  Consistent
 | 
			
		||||
//  AccDirty
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
#define Empty         (0x0)  /*Entry unoccupied  */
 | 
			
		||||
#define CpuDirty      (0x1)  /*CPU copy is golden, Acc buffer MAY not be allocated*/
 | 
			
		||||
#define Consistent    (0x2)  /*ACC copy AND CPU copy are valid */
 | 
			
		||||
#define AccDirty      (0x4)  /*ACC copy is golden */
 | 
			
		||||
#define EvictNext     (0x8)  /*Priority for eviction*/
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////
 | 
			
		||||
// Mechanics of data table maintenance
 | 
			
		||||
/////////////////////////////////////////////////
 | 
			
		||||
int   MemoryManager::EntryPresent(uint64_t CpuPtr)
 | 
			
		||||
{
 | 
			
		||||
  if(AccViewTable.empty()) return 0;
 | 
			
		||||
 | 
			
		||||
  auto count = AccViewTable.count(CpuPtr);  assert((count==0)||(count==1));
 | 
			
		||||
  return count;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 | 
			
		||||
{
 | 
			
		||||
  assert(!EntryPresent(CpuPtr));
 | 
			
		||||
  AcceleratorViewEntry AccCache;
 | 
			
		||||
  AccCache.CpuPtr = CpuPtr;
 | 
			
		||||
  AccCache.AccPtr = (uint64_t)NULL;
 | 
			
		||||
  AccCache.bytes  = bytes;
 | 
			
		||||
  AccCache.state  = CpuDirty;
 | 
			
		||||
  AccCache.LRU_valid=0;
 | 
			
		||||
  AccCache.transient=0;
 | 
			
		||||
  AccCache.accLock=0;
 | 
			
		||||
  AccCache.cpuLock=0;
 | 
			
		||||
  AccViewTable[CpuPtr] = AccCache;
 | 
			
		||||
}
 | 
			
		||||
// Return the table iterator for the entry keyed by CpuPtr.
// The entry is required to exist; absence trips an assert.
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
  assert(EntryPresent(CpuPtr));

  auto where = AccViewTable.find(CpuPtr);
  assert(where!=AccViewTable.end());
  return where;
}
 | 
			
		||||
void MemoryManager::EntryErase(uint64_t CpuPtr)
 | 
			
		||||
{
 | 
			
		||||
  auto AccCache = EntryLookup(CpuPtr);
 | 
			
		||||
  AccViewTable.erase(CpuPtr);
 | 
			
		||||
}
 | 
			
		||||
// Link an entry into the LRU list and add its size to DeviceLRUBytes.
// Transient entries are placed at the back of the list, all others at the front.
// The entry must not already be linked.
void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==0);

  // list::insert places the value before the given position and returns an
  // iterator to the new node: before end() is the back, before begin() the front.
  if (AccCache.transient) {
    AccCache.LRU_entry = LRU.insert(LRU.end(),AccCache.CpuPtr);
  } else {
    AccCache.LRU_entry = LRU.insert(LRU.begin(),AccCache.CpuPtr);
  }

  DeviceLRUBytes+=AccCache.bytes;
  AccCache.LRU_valid = 1;
}
 | 
			
		||||
// Unlink an entry from the LRU list and subtract its size from DeviceLRUBytes.
// The entry must currently be linked.
void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==1);

  LRU.erase(AccCache.LRU_entry);
  DeviceLRUBytes-=AccCache.bytes;
  AccCache.LRU_valid = 0;
}
 | 
			
		||||
/////////////////////////////////////////////////
 | 
			
		||||
// Accelerator cache motion & consistency logic
 | 
			
		||||
/////////////////////////////////////////////////
 | 
			
		||||
// Drop an entry without copying anything back to the cpu: release its
// accelerator buffer (if it has one) and erase it from the view table.
// The entry must be non-empty and hold neither an accelerator nor a cpu lock;
// an entry with an accelerator buffer must be linked into the LRU list.
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Take a copy of the key before the entry is erased.
  // NOTE(review): presumably AccCache refers to the table element itself - confirm.
  const uint64_t key = AccCache.CpuPtr;

  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes   -=AccCache.bytes;
    LRUremove(AccCache);
  }

  EntryErase(key);
}
 | 
			
		||||
 | 
			
		||||
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
  ///////////////////////////////////////////////////////////////////////////
  // Like AccDiscard, but first brings the host copy up to date:
  //  - flush device data to the host if the device copy is the dirty one
  //  - release the device allocation (if there is one)
  //  - erase the table entry
  // The entry must not be locked by any open view; an entry that owns
  // device memory is expected to be sitting in the LRU pool.
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);

  if(AccCache.state==AccDirty) Flush(AccCache);

  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Take the key now; it is the last thing used once the entry is erased
  uint64_t key = AccCache.CpuPtr;

  if(AccCache.AccPtr!=(uint64_t)NULL) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes = DeviceBytes - AccCache.bytes;
    LRUremove(AccCache);
  }

  EntryErase(key);
}
 | 
			
		||||
// Copy an AccDirty entry's device buffer back to its host buffer, update the
// device-to-host transfer statistics and mark the entry Consistent.
// Requires an unlocked entry with both pointers set.
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==AccDirty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  assert(AccCache.AccPtr!=(uint64_t)NULL);

  void *device_src = (void *)AccCache.AccPtr;
  void *host_dst   = (void *)AccCache.CpuPtr;
  acceleratorCopyFromDevice(device_src,host_dst,AccCache.bytes);

  DeviceToHostXfer++;
  DeviceToHostBytes = DeviceToHostBytes + AccCache.bytes;

  AccCache.state = Consistent;
}
 | 
			
		||||
// Copy a CpuDirty entry's host buffer to the device, allocating the device
// buffer on first use, update the host-to-device transfer statistics and
// mark the entry Consistent. Requires an unlocked entry.
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==CpuDirty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Deferred device allocation
  const bool needsDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if(needsDeviceBuffer){
    AccCache.AccPtr = (uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes = DeviceBytes + AccCache.bytes;
  }

  void *host_src   = (void *)AccCache.CpuPtr;
  void *device_dst = (void *)AccCache.AccPtr;
  acceleratorCopyToDevice(host_src,device_dst,AccCache.bytes);

  HostToDeviceXfer++;
  HostToDeviceBytes = HostToDeviceBytes + AccCache.bytes;

  AccCache.state = Consistent;
}
 | 
			
		||||
 | 
			
		||||
// Make the device copy the authoritative one WITHOUT transferring any host
// data: allocate the device buffer if it does not exist yet and mark the
// entry AccDirty. Requires an unlocked, non-empty entry.
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Deferred device allocation
  const bool needsDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if(needsDeviceBuffer){
    AccCache.AccPtr = (uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes = DeviceBytes + AccCache.bytes;
  }

  AccCache.state = AccDirty;
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// View management
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 | 
			
		||||
{
 | 
			
		||||
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
 | 
			
		||||
    AcceleratorViewClose((uint64_t)Ptr);
 | 
			
		||||
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
 | 
			
		||||
    CpuViewClose((uint64_t)Ptr);
 | 
			
		||||
  } else { 
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
 | 
			
		||||
  if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
 | 
			
		||||
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
 | 
			
		||||
  } else if( (mode==CpuRead)||(mode==CpuWrite)){
 | 
			
		||||
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
 | 
			
		||||
  } else { 
 | 
			
		||||
    assert(0);
 | 
			
		||||
    return NULL;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::EvictVictims(uint64_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
 | 
			
		||||
    if ( DeviceLRUBytes > 0){
 | 
			
		||||
      assert(LRU.size()>0);
 | 
			
		||||
      uint64_t victim = LRU.back();
 | 
			
		||||
      auto AccCacheIterator = EntryLookup(victim);
 | 
			
		||||
      auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
      Evict(AccCache);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 | 
			
		||||
{
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Find if present, otherwise get or force an empty
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  if ( EntryPresent(CpuPtr)==0 ){
 | 
			
		||||
    EvictVictims(bytes);
 | 
			
		||||
    EntryCreate(CpuPtr,bytes,mode,hint);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
  
 | 
			
		||||
  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
 | 
			
		||||
 | 
			
		||||
  assert(AccCache.cpuLock==0);  // Programming error
 | 
			
		||||
 | 
			
		||||
  if(AccCache.state!=Empty) {
 | 
			
		||||
    assert(AccCache.CpuPtr == CpuPtr);
 | 
			
		||||
    assert(AccCache.bytes  ==bytes);
 | 
			
		||||
  }
 | 
			
		||||
/*
 | 
			
		||||
 *  State transitions and actions
 | 
			
		||||
 *
 | 
			
		||||
 *  Action  State   StateNext         Flush    Clone
 | 
			
		||||
 *
 | 
			
		||||
 *  AccRead  Empty   Consistent        -        Y
 | 
			
		||||
 *  AccWrite Empty   AccDirty          -        Y
 | 
			
		||||
 *  AccRead  CpuDirty Consistent       -        Y
 | 
			
		||||
 *  AccWrite CpuDirty AccDirty         -        Y
 | 
			
		||||
 *  AccRead  Consistent Consistent     -        - 
 | 
			
		||||
 *  AccWrite Consistent AccDirty       -        - 
 | 
			
		||||
 *  AccRead  AccDirty   AccDirty       -        - 
 | 
			
		||||
 *  AccWrite AccDirty   AccDirty       -        - 
 | 
			
		||||
 */
 | 
			
		||||
  if(AccCache.state==Empty) {
 | 
			
		||||
    assert(AccCache.LRU_valid==0);
 | 
			
		||||
    AccCache.CpuPtr = CpuPtr;
 | 
			
		||||
    AccCache.AccPtr = (uint64_t)NULL;
 | 
			
		||||
    AccCache.bytes  = bytes;
 | 
			
		||||
    AccCache.state  = CpuDirty;   // Cpu starts primary
 | 
			
		||||
    if(mode==AcceleratorWriteDiscard){
 | 
			
		||||
      CpuDiscard(AccCache);
 | 
			
		||||
      AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty
 | 
			
		||||
    } else if(mode==AcceleratorWrite){
 | 
			
		||||
      Clone(AccCache);
 | 
			
		||||
      AccCache.state  = AccDirty;   // Empty + AcceleratorWrite=> AccDirty
 | 
			
		||||
    } else {
 | 
			
		||||
      Clone(AccCache);
 | 
			
		||||
      AccCache.state  = Consistent; // Empty + AccRead => Consistent
 | 
			
		||||
    }
 | 
			
		||||
    AccCache.accLock= 1;
 | 
			
		||||
  } else if(AccCache.state==CpuDirty ){
 | 
			
		||||
    if(mode==AcceleratorWriteDiscard) {
 | 
			
		||||
      CpuDiscard(AccCache);
 | 
			
		||||
      AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty
 | 
			
		||||
    } else if(mode==AcceleratorWrite) {
 | 
			
		||||
      Clone(AccCache);
 | 
			
		||||
      AccCache.state  = AccDirty;   // CpuDirty + AcceleratorWrite=> AccDirty
 | 
			
		||||
    } else {
 | 
			
		||||
      Clone(AccCache);
 | 
			
		||||
      AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
 | 
			
		||||
    }
 | 
			
		||||
    AccCache.accLock++;
 | 
			
		||||
    //    printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
 | 
			
		||||
  } else if(AccCache.state==Consistent) {
 | 
			
		||||
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
 | 
			
		||||
      AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
 | 
			
		||||
    else
 | 
			
		||||
      AccCache.state  = Consistent; // Consistent + AccRead => Consistent
 | 
			
		||||
    AccCache.accLock++;
 | 
			
		||||
    //    printf("Consistent entry into device accLock %d\n",AccCache.accLock);
 | 
			
		||||
  } else if(AccCache.state==AccDirty) {
 | 
			
		||||
    if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
 | 
			
		||||
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
 | 
			
		||||
    else
 | 
			
		||||
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
 | 
			
		||||
    AccCache.accLock++;
 | 
			
		||||
    //    printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
 | 
			
		||||
  } else {
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // If view is opened on device remove from LRU
 | 
			
		||||
  if(AccCache.LRU_valid==1){
 | 
			
		||||
    // must possibly remove from LRU as now locked on GPU
 | 
			
		||||
    LRUremove(AccCache);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int transient =hint;
 | 
			
		||||
  AccCache.transient= transient? EvictNext : 0;
 | 
			
		||||
 | 
			
		||||
  return AccCache.AccPtr;
 | 
			
		||||
}
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
// look up & decrement lock count
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
 | 
			
		||||
{
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
 | 
			
		||||
  assert(AccCache.cpuLock==0);
 | 
			
		||||
  assert(AccCache.accLock>0);
 | 
			
		||||
 | 
			
		||||
  AccCache.accLock--;
 | 
			
		||||
 | 
			
		||||
  // Move to LRU queue if not locked and close on device
 | 
			
		||||
  if(AccCache.accLock==0) {
 | 
			
		||||
    LRUinsert(AccCache);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
 | 
			
		||||
{
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
 | 
			
		||||
  assert(AccCache.cpuLock>0);
 | 
			
		||||
  assert(AccCache.accLock==0);
 | 
			
		||||
 | 
			
		||||
  AccCache.cpuLock--;
 | 
			
		||||
}
 | 
			
		||||
/*
 | 
			
		||||
 *  Action  State   StateNext         Flush    Clone
 | 
			
		||||
 *
 | 
			
		||||
 *  CpuRead  Empty   CpuDirty          -        -
 | 
			
		||||
 *  CpuWrite Empty   CpuDirty          -        -
 | 
			
		||||
 *  CpuRead  CpuDirty CpuDirty         -        -
 | 
			
		||||
 *  CpuWrite CpuDirty CpuDirty         -        - 
 | 
			
		||||
 *  CpuRead  Consistent Consistent     -        - 
 | 
			
		||||
 *  CpuWrite Consistent CpuDirty       -        - 
 | 
			
		||||
 *  CpuRead  AccDirty   Consistent     Y        -
 | 
			
		||||
 *  CpuWrite AccDirty   CpuDirty       Y        -
 | 
			
		||||
 */
 | 
			
		||||
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
 | 
			
		||||
{
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Find if present, otherwise get or force an empty
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  if ( EntryPresent(CpuPtr)==0 ){
 | 
			
		||||
    EvictVictims(bytes);
 | 
			
		||||
    EntryCreate(CpuPtr,bytes,mode,transient);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
  
 | 
			
		||||
  assert((mode==CpuRead)||(mode==CpuWrite));
 | 
			
		||||
  assert(AccCache.accLock==0);  // Programming error
 | 
			
		||||
 | 
			
		||||
  if(AccCache.state!=Empty) {
 | 
			
		||||
    assert(AccCache.CpuPtr == CpuPtr);
 | 
			
		||||
    assert(AccCache.bytes==bytes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if(AccCache.state==Empty) {
 | 
			
		||||
    AccCache.CpuPtr = CpuPtr;
 | 
			
		||||
    AccCache.AccPtr = (uint64_t)NULL;
 | 
			
		||||
    AccCache.bytes  = bytes;
 | 
			
		||||
    AccCache.state  = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
 | 
			
		||||
    AccCache.accLock= 0;
 | 
			
		||||
    AccCache.cpuLock= 1;
 | 
			
		||||
  } else if(AccCache.state==CpuDirty ){
 | 
			
		||||
    // AccPtr dont care, deferred allocate
 | 
			
		||||
    AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
 | 
			
		||||
    AccCache.cpuLock++;
 | 
			
		||||
  } else if(AccCache.state==Consistent) {
 | 
			
		||||
    assert(AccCache.AccPtr != (uint64_t)NULL);
 | 
			
		||||
    if(mode==CpuWrite)
 | 
			
		||||
      AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
 | 
			
		||||
    else 
 | 
			
		||||
      AccCache.state = Consistent; // Consistent +CpuRead  => Consistent
 | 
			
		||||
    AccCache.cpuLock++;
 | 
			
		||||
  } else if(AccCache.state==AccDirty) {
 | 
			
		||||
    assert(AccCache.AccPtr != (uint64_t)NULL);
 | 
			
		||||
    Flush(AccCache);
 | 
			
		||||
    if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
 | 
			
		||||
    else            AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
 | 
			
		||||
    AccCache.cpuLock++;
 | 
			
		||||
  } else {
 | 
			
		||||
    assert(0); // should be unreachable
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  AccCache.transient= transient? EvictNext : 0;
 | 
			
		||||
 | 
			
		||||
  return AccCache.CpuPtr;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
			
		||||
{
 | 
			
		||||
  // Look up in ViewCache
 | 
			
		||||
  uint64_t ptr = (uint64_t)_ptr;
 | 
			
		||||
  if(EntryPresent(ptr)) {
 | 
			
		||||
    auto e = EntryLookup(ptr);
 | 
			
		||||
    AccDiscard(e->second);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::Print(void)
 | 
			
		||||
{
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 | 
			
		||||
    auto &AccCache = it->second;
 | 
			
		||||
    
 | 
			
		||||
    std::string str;
 | 
			
		||||
    if ( AccCache.state==Empty    ) str = std::string("Empty");
 | 
			
		||||
    if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
 | 
			
		||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
			
		||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
	      << "\t" << AccCache.cpuLock
 | 
			
		||||
	      << "\t" << AccCache.accLock
 | 
			
		||||
	      << "\t" << AccCache.LRU_valid<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
int   MemoryManager::isOpen   (void* _CpuPtr) 
 | 
			
		||||
{ 
 | 
			
		||||
  uint64_t CpuPtr = (uint64_t)_CpuPtr;
 | 
			
		||||
  if ( EntryPresent(CpuPtr) ){
 | 
			
		||||
    auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
    auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
    return AccCache.cpuLock+AccCache.accLock;
 | 
			
		||||
  } else { 
 | 
			
		||||
    return 0;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										24
									
								
								Grid/allocator/MemoryManagerShared.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								Grid/allocator/MemoryManagerShared.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,24 @@
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
#ifdef GRID_UVM
 | 
			
		||||
 | 
			
		||||
#warning "Grid is assuming unified virtual memory address space"
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// View management is 1:1 address space mapping
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Storage for the MemoryManager statistics counters in the GRID_UVM build.
// Zero-initialised, except for the device budget which defaults to 128 MiB.
uint64_t MemoryManager::DeviceBytes;                         // bytes allocated on device
uint64_t MemoryManager::DeviceLRUBytes;                      // bytes evictable on device
uint64_t MemoryManager::DeviceMaxBytes    = 128*1024*1024;   // device byte budget
uint64_t MemoryManager::HostToDeviceBytes;                   // bytes copied host -> device
uint64_t MemoryManager::DeviceToHostBytes;                   // bytes copied device -> host
uint64_t MemoryManager::HostToDeviceXfer;                    // number of host -> device copies
uint64_t MemoryManager::DeviceToHostXfer;                    // number of device -> host copies
 | 
			
		||||
 | 
			
		||||
void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 | 
			
		||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 | 
			
		||||
int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
 | 
			
		||||
void  MemoryManager::Print(void){};
 | 
			
		||||
void  MemoryManager::NotifyDeletion(void *ptr){};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										67
									
								
								Grid/allocator/MemoryStats.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								Grid/allocator/MemoryStats.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <vector>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// Active statistics record; while null the profiler macros skip all accounting.
MemoryStats *MemoryProfiler::stats = nullptr;
 | 
			
		||||
// When true the profiler macros log every allocate/free to GridLogDebug.
bool         MemoryProfiler::debug = false;
 | 
			
		||||
 | 
			
		||||
void check_huge_pages(void *Buf,uint64_t BYTES)
 | 
			
		||||
{
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
  int fd = open("/proc/self/pagemap", O_RDONLY);
 | 
			
		||||
  assert(fd >= 0);
 | 
			
		||||
  const int page_size = 4096;
 | 
			
		||||
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
 | 
			
		||||
  off_t offset = sizeof(uint64_t) * virt_pfn;
 | 
			
		||||
  uint64_t npages = (BYTES + page_size-1) / page_size;
 | 
			
		||||
  uint64_t pagedata[npages];
 | 
			
		||||
  uint64_t ret = lseek(fd, offset, SEEK_SET);
 | 
			
		||||
  assert(ret == offset);
 | 
			
		||||
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
 | 
			
		||||
  assert(ret == sizeof(uint64_t) * npages);
 | 
			
		||||
  int nhugepages = npages / 512;
 | 
			
		||||
  int n4ktotal, nnothuge;
 | 
			
		||||
  n4ktotal = 0;
 | 
			
		||||
  nnothuge = 0;
 | 
			
		||||
  for (int i = 0; i < nhugepages; ++i) {
 | 
			
		||||
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
 | 
			
		||||
    for (int j = 0; j < 512; ++j) {
 | 
			
		||||
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
 | 
			
		||||
      ++n4ktotal;
 | 
			
		||||
      if (pageaddr != baseaddr + j * page_size)
 | 
			
		||||
	++nnothuge;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  int rank = CartesianCommunicator::RankWorld();
 | 
			
		||||
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Format a byte count as a short human readable string using binary
// (1024-based) steps: "", K, M, G, T, P, E followed by "B".
// Whole values print without decimals ("1 KB"), others with one ("1.5 KB").
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize = 256;
  constexpr size_t       nSuffix = 7;
  const char             *suffixes[nSuffix] = {"", "K", "M", "G", "T", "P", "E"};
  char                   buf[bufSize];
  size_t                 s     = 0;
  double                 count = bytes;

  // Stop at the last suffix: s must stay a valid index into suffixes[]
  while (count >= 1024 && s + 1 < nSuffix)
    {
      s++;
      count /= 1024;
    }
  if (count - floor(count) == 0.0)
    {
      snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
    }
  else
    {
      snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
    }

  return std::string(buf);
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										95
									
								
								Grid/allocator/MemoryStats.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								Grid/allocator/MemoryStats.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,95 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/MemoryStats.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// Human readable form of a byte count (used by memString below).
std::string sizeString(size_t bytes);
 | 
			
		||||
 | 
			
		||||
// Byte counters maintained by the profilerAllocate / profilerFree macros.
struct MemoryStats
{
  size_t totalAllocated{0};      // running sum of all allocated bytes
  size_t maxAllocated{0};        // high-water mark of currentlyAllocated
  size_t currentlyAllocated{0};  // allocated minus freed
  size_t totalFreed{0};          // running sum of all freed bytes
};
 | 
			
		||||
    
 | 
			
		||||
class MemoryProfiler
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  static MemoryStats *stats;
 | 
			
		||||
  static bool        debug;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// "<bytes> (<human readable>)" as a std::string. The expansion is wrapped in
// parentheses so it behaves as a single operand wherever it is used.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
 | 
			
		||||
// Log the current MemoryProfiler::stats counters to GridLogDebug.
// Expands to a single `if` statement; does nothing while stats is null.
#define profilerDebugPrint                                              \
  if (MemoryProfiler::stats) {                                          \
    std::cout << GridLogDebug << "[Memory debug] Stats "                \
              << MemoryProfiler::stats << std::endl;                    \
    std::cout << GridLogDebug << "[Memory debug] total  : "             \
              << memString(MemoryProfiler::stats->totalAllocated)       \
              << std::endl;                                             \
    std::cout << GridLogDebug << "[Memory debug] max    : "             \
              << memString(MemoryProfiler::stats->maxAllocated)         \
              << std::endl;                                             \
    std::cout << GridLogDebug << "[Memory debug] current: "             \
              << memString(MemoryProfiler::stats->currentlyAllocated)   \
              << std::endl;                                             \
    std::cout << GridLogDebug << "[Memory debug] freed  : "             \
              << memString(MemoryProfiler::stats->totalFreed)           \
              << std::endl;                                             \
  }
 | 
			
		||||
 | 
			
		||||
// Account for an allocation of `bytes`: bump the total and current counters
// and the high-water mark when MemoryProfiler::stats is set, and log the
// event plus a stats dump when MemoryProfiler::debug is set.
// The counters are reached through MemoryProfiler::stats directly; no local
// alias is introduced, so a `bytes` expression that mentions a caller
// variable (e.g. one named `s`) cannot be shadowed by the macro.
// Expands to two consecutive `if` statements and evaluates `bytes` more than
// once, as before.
#define profilerAllocate(bytes)                                         \
  if (MemoryProfiler::stats)                                            \
    {                                                                   \
      MemoryProfiler::stats->totalAllocated     += (bytes);             \
      MemoryProfiler::stats->currentlyAllocated += (bytes);             \
      MemoryProfiler::stats->maxAllocated        =                      \
        std::max(MemoryProfiler::stats->maxAllocated,                   \
                 MemoryProfiler::stats->currentlyAllocated);            \
    }                                                                   \
  if (MemoryProfiler::debug)                                            \
    {                                                                   \
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
      profilerDebugPrint;                                               \
    }
 | 
			
		||||
 | 
			
		||||
// Account for a release of `bytes`: bump the freed counter and lower the
// current counter when MemoryProfiler::stats is set, and log the event plus
// a stats dump when MemoryProfiler::debug is set.
// The counters are reached through MemoryProfiler::stats directly; no local
// alias is introduced, so a `bytes` expression that mentions a caller
// variable (e.g. one named `s`) cannot be shadowed by the macro.
// Expands to two consecutive `if` statements and evaluates `bytes` more than
// once, as before.
#define profilerFree(bytes)                                             \
  if (MemoryProfiler::stats)                                            \
    {                                                                   \
      MemoryProfiler::stats->totalFreed         += (bytes);             \
      MemoryProfiler::stats->currentlyAllocated -= (bytes);             \
    }                                                                   \
  if (MemoryProfiler::debug)                                            \
    {                                                                   \
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
      profilerDebugPrint;                                               \
    }
 | 
			
		||||
 | 
			
		||||
// Diagnostic: report how many 4k pages backing [Buf, Buf+BYTES) are not in huge-page sized contiguous runs (Linux only).
void check_huge_pages(void *Buf,uint64_t BYTES);
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
@@ -81,6 +81,7 @@ public:
 | 
			
		||||
 | 
			
		||||
  bool _isCheckerBoarded; 
 | 
			
		||||
  int        LocallyPeriodic;
 | 
			
		||||
  Coordinate _checker_dim_mask;
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
  int dummy;
 | 
			
		||||
  Coordinate _checker_dim_mask;
 | 
			
		||||
  virtual int  CheckerBoardFromOindexTable (int Oindex) {
 | 
			
		||||
    return 0;
 | 
			
		||||
  }
 | 
			
		||||
@@ -104,6 +105,7 @@ public:
 | 
			
		||||
    _ldimensions.resize(_ndimension);
 | 
			
		||||
    _rdimensions.resize(_ndimension);
 | 
			
		||||
    _simd_layout.resize(_ndimension);
 | 
			
		||||
    _checker_dim_mask.resize(_ndimension);;
 | 
			
		||||
    _lstart.resize(_ndimension);
 | 
			
		||||
    _lend.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
@@ -114,6 +116,8 @@ public:
 | 
			
		||||
 | 
			
		||||
    for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
	_checker_dim_mask[d]=0;
 | 
			
		||||
 | 
			
		||||
        _fdimensions[d] = dimensions[d];   // Global dimensions
 | 
			
		||||
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
 | 
			
		||||
        _simd_layout[d] = simd_layout[d];
 | 
			
		||||
 
 | 
			
		||||
@@ -35,12 +35,28 @@ static const int CbRed  =0;
 | 
			
		||||
static const int CbBlack=1;
 | 
			
		||||
static const int Even   =CbRed;
 | 
			
		||||
static const int Odd    =CbBlack;
 | 
			
		||||
 | 
			
		||||
// Checkerboard parity (0 or 1) of the site with lexicographic index `oindex`
// in a grid of extents `rdim`: the low bit of the sum of the site's
// coordinates over those dimensions whose entry in chk_dim_msk is non-zero.
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
  const int ndim = rdim.size();

  // Decode the index into per-dimension coordinates
  Coordinate site(ndim);
  Lexicographic::CoorFromIndex(site,oindex,rdim);

  int masked_sum = 0;
  for(int mu=0; mu<ndim; mu++){
    if(chk_dim_msk[mu]) masked_sum += site[mu];
  }
  return masked_sum & 0x1;
}
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
// Specialise this for red black grids storing half the data like a chess board.
 | 
			
		||||
class GridRedBlackCartesian : public GridBase
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  Coordinate _checker_dim_mask;
 | 
			
		||||
  //  Coordinate _checker_dim_mask;
 | 
			
		||||
  int              _checker_dim;
 | 
			
		||||
  std::vector<int> _checker_board;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
#include <pwd.h>
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#include <cuda_runtime_api.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
@@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
 | 
			
		||||
  std::vector<int> primes({2,3,5});
 | 
			
		||||
 | 
			
		||||
  int dim = 0;
 | 
			
		||||
  int last_dim = ndimension - 1;
 | 
			
		||||
  int AutoShmSize = 1;
 | 
			
		||||
  while(AutoShmSize != WorldShmSize) {
 | 
			
		||||
    for(int p=0;p<primes.size();p++) {
 | 
			
		||||
    int p;
 | 
			
		||||
    for(p=0;p<primes.size();p++) {
 | 
			
		||||
      int prime=primes[p];
 | 
			
		||||
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
 | 
			
		||||
        && divides(prime,WorldShmSize/AutoShmSize)  ) {
 | 
			
		||||
	AutoShmSize*=prime;
 | 
			
		||||
	ShmDims[dim]*=prime;
 | 
			
		||||
	last_dim = dim;
 | 
			
		||||
	break;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    if (p == primes.size() && last_dim == dim) {
 | 
			
		||||
      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
 | 
			
		||||
      exit(EXIT_FAILURE);
 | 
			
		||||
    }
 | 
			
		||||
    dim=(dim+1) %ndimension;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -413,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Hugetlbfs mapping intended
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
			
		||||
{
 | 
			
		||||
  void * ShmCommBuf ; 
 | 
			
		||||
@@ -433,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_IBM_SUMMIT
 | 
			
		||||
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
 | 
			
		||||
    std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
 | 
			
		||||
#else
 | 
			
		||||
    std::cout << "setting device to WorldShmRank"<<std::endl;
 | 
			
		||||
    cudaSetDevice(WorldShmRank);
 | 
			
		||||
#endif
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Each MPI rank should allocate our own buffer
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -677,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  cudaMemset(dest,0,bytes);
 | 
			
		||||
#else
 | 
			
		||||
  bzero(dest,bytes);
 | 
			
		||||
@@ -685,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 | 
			
		||||
}
 | 
			
		||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
 | 
			
		||||
#else   
 | 
			
		||||
  bcopy(src,dest,bytes);
 | 
			
		||||
 
 | 
			
		||||
@@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
extern Vector<std::pair<int,int> > Cshift_table; 
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split 
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
  int ent = 0;
 | 
			
		||||
 | 
			
		||||
  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
 | 
			
		||||
  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
 | 
			
		||||
 | 
			
		||||
  int stride=rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  if ( cbmask == 0x3 ) { 
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o  = n*stride;
 | 
			
		||||
	int bo = n*e2;
 | 
			
		||||
	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
 | 
			
		||||
	Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
@@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
 | 
			
		||||
	 int o  = n*stride;
 | 
			
		||||
	 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	 if ( ocb &cbmask ) {
 | 
			
		||||
	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 | 
			
		||||
	   Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 | 
			
		||||
	 }
 | 
			
		||||
       }
 | 
			
		||||
     }
 | 
			
		||||
  }
 | 
			
		||||
  thread_for(i,ent,{
 | 
			
		||||
    buffer[table[i].first]=rhs_v[table[i].second];
 | 
			
		||||
  });
 | 
			
		||||
  {
 | 
			
		||||
    autoView(rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
    auto buffer_p = & buffer[0];
 | 
			
		||||
    auto table = &Cshift_table[0];
 | 
			
		||||
    accelerator_for(i,ent,1,{
 | 
			
		||||
      buffer_p[table[i].first]=rhs_v[table[i].second];
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
  int n1=rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  if ( cbmask ==0x3){
 | 
			
		||||
    thread_for_collapse(2,n,e1,{
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
 | 
			
		||||
    autoView(rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
    accelerator_for2d(n,e1,b,e2,1,{
 | 
			
		||||
	int o      =   n*n1;
 | 
			
		||||
	int offset = b+n*e2;
 | 
			
		||||
	
 | 
			
		||||
	vobj temp =rhs_v[so+o+b];
 | 
			
		||||
	extract<vobj>(temp,pointers,offset);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
      });
 | 
			
		||||
  } else { 
 | 
			
		||||
    autoView(rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
 | 
			
		||||
    // Test_cshift_red_black code.
 | 
			
		||||
    std::cout << " Dense packed buffer WARNING " <<std::endl;
 | 
			
		||||
    thread_for_collapse(2,n,e1,{
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
    Coordinate rdim=rhs.Grid()->_rdimensions;
 | 
			
		||||
    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
 | 
			
		||||
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
 | 
			
		||||
    accelerator_for2d(n,e1,b,e2,1,{
 | 
			
		||||
 | 
			
		||||
	Coordinate coor;
 | 
			
		||||
 | 
			
		||||
	int o=n*n1;
 | 
			
		||||
	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	int oindex = o+b;
 | 
			
		||||
 | 
			
		||||
       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
 | 
			
		||||
 | 
			
		||||
	int ocb=1<<cb;
 | 
			
		||||
	int offset = b+n*e2;
 | 
			
		||||
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  vobj temp =rhs_v[so+o+b];
 | 
			
		||||
	  extract<vobj>(temp,pointers,offset);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
      });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
  int stride=rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
 | 
			
		||||
  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
 | 
			
		||||
 | 
			
		||||
  int ent    =0;
 | 
			
		||||
 | 
			
		||||
  if ( cbmask ==0x3 ) {
 | 
			
		||||
@@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o   =n*rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
	int bo  =n*rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
 | 
			
		||||
	Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -165,16 +175,20 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
 | 
			
		||||
	int o   =n*rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 | 
			
		||||
	  Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  thread_for(i,ent,{
 | 
			
		||||
    rhs_v[table[i].first]=buffer[table[i].second];
 | 
			
		||||
  });
 | 
			
		||||
  {
 | 
			
		||||
    autoView( rhs_v, rhs, AcceleratorWrite);
 | 
			
		||||
    auto buffer_p = & buffer[0];
 | 
			
		||||
    auto table = &Cshift_table[0];
 | 
			
		||||
    accelerator_for(i,ent,1,{
 | 
			
		||||
	rhs_v[table[i].first]=buffer_p[table[i].second];
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
@@ -194,21 +208,19 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
 | 
			
		||||
  if(cbmask ==0x3 ) {
 | 
			
		||||
    auto rhs_v = rhs.View();
 | 
			
		||||
    thread_for_collapse(2,n,e1,{
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
    autoView( rhs_v , rhs, AcceleratorWrite);
 | 
			
		||||
    accelerator_for2d(n,e1,b,e2,1,{
 | 
			
		||||
	int o      = n*rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
	int offset = b+n*rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
	merge(rhs_v[so+o+b],pointers,offset);
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
      });
 | 
			
		||||
  } else { 
 | 
			
		||||
 | 
			
		||||
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
 | 
			
		||||
    // Test_cshift_red_black code.
 | 
			
		||||
    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
 | 
			
		||||
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
 | 
			
		||||
    auto rhs_v = rhs.View();
 | 
			
		||||
    autoView( rhs_v, rhs, CpuWrite);
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o      = n*rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
@@ -225,6 +237,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// local to node block strided copies
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs.Grid()->_rdimensions[dimension];
 | 
			
		||||
@@ -239,14 +252,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
			
		||||
  int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block[dimension];
 | 
			
		||||
  int stride = rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
 | 
			
		||||
 | 
			
		||||
  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
 | 
			
		||||
 | 
			
		||||
  int ent=0;
 | 
			
		||||
 | 
			
		||||
  if(cbmask == 0x3 ){
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
        int o =n*stride+b;
 | 
			
		||||
	table[ent++] = std::pair<int,int>(lo+o,ro+o);
 | 
			
		||||
	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
@@ -255,23 +270,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
			
		||||
        int o =n*stride+b;
 | 
			
		||||
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
 | 
			
		||||
        if ( ocb&cbmask ) {
 | 
			
		||||
	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 | 
			
		||||
	  Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for(i,ent,{
 | 
			
		||||
    lhs_v[table[i].first]=rhs_v[table[i].second];
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    autoView(rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
    autoView(lhs_v , lhs, AcceleratorWrite);
 | 
			
		||||
    auto table = &Cshift_table[0];
 | 
			
		||||
    accelerator_for(i,ent,1,{
 | 
			
		||||
      lhs_v[table[i].first]=rhs_v[table[i].second];
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
  int rd = rhs.Grid()->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
 | 
			
		||||
@@ -285,29 +301,33 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 | 
			
		||||
  int e2=rhs.Grid()->_slice_block [dimension];
 | 
			
		||||
  int stride = rhs.Grid()->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
 | 
			
		||||
  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
 | 
			
		||||
 | 
			
		||||
  int ent=0;
 | 
			
		||||
 | 
			
		||||
  if ( cbmask == 0x3 ) {
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o  =n*stride;
 | 
			
		||||
      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 | 
			
		||||
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 | 
			
		||||
    }}
 | 
			
		||||
  } else {
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o  =n*stride;
 | 
			
		||||
      int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
 | 
			
		||||
      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 | 
			
		||||
      if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for(i,ent,{
 | 
			
		||||
    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
 | 
			
		||||
  });
 | 
			
		||||
  {
 | 
			
		||||
    autoView( rhs_v, rhs, AcceleratorRead);
 | 
			
		||||
    autoView( lhs_v, lhs, AcceleratorWrite);
 | 
			
		||||
    auto table = &Cshift_table[0];
 | 
			
		||||
    accelerator_for(i,ent,1,{
 | 
			
		||||
      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										4
									
								
								Grid/cshift/Cshift_table.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								Grid/cshift/Cshift_table.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,4 @@
 | 
			
		||||
#include <Grid/GridCore.h>       
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
Vector<std::pair<int,int> > Cshift_table; 
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#pragma once
 | 
			
		||||
#include <Grid/lattice/Lattice_view.h>
 | 
			
		||||
#include <Grid/lattice/Lattice_base.h>
 | 
			
		||||
#include <Grid/lattice/Lattice_conformable.h>
 | 
			
		||||
#include <Grid/lattice/Lattice_ET.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -92,12 +92,18 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
  return arg[ss];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// What needs this?
 | 
			
		||||
// Cannot be legal on accelerator
 | 
			
		||||
// Comparison must convert
 | 
			
		||||
#if 1
 | 
			
		||||
template <class lobj> accelerator_inline 
 | 
			
		||||
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 | 
			
		||||
{
 | 
			
		||||
  auto view = arg.AcceleratorView(ViewRead);
 | 
			
		||||
  auto view = arg.View(AcceleratorRead);
 | 
			
		||||
  return view[ss];
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////
 | 
			
		||||
// handle nodes in syntax tree- eval one operand
 | 
			
		||||
@@ -180,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 | 
			
		||||
  cb = lat.Checkerboard();
 | 
			
		||||
}
 | 
			
		||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 | 
			
		||||
inline void CBFromExpression(int &cb, const T1 ¬lat)  // non-lattice leaf
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void CBFromExpression(int &cb, const T1 ¬lat) {} // non-lattice leaf
 | 
			
		||||
template <typename Op, typename T1> inline 
 | 
			
		||||
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  CBFromExpression(cb, expr.arg1);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename T1, typename T2> inline 
 | 
			
		||||
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) 
 | 
			
		||||
{
 | 
			
		||||
@@ -204,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
 | 
			
		||||
  CBFromExpression(cb, expr.arg3);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// ViewOpen
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 | 
			
		||||
inline void ExpressionViewOpen(T1 &lat)  // Lattice leaf
 | 
			
		||||
{
 | 
			
		||||
  lat.ViewOpen(AcceleratorRead);
 | 
			
		||||
}
 | 
			
		||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 | 
			
		||||
  inline void ExpressionViewOpen(T1 ¬lat) {}
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename T1> inline 
 | 
			
		||||
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr) 
 | 
			
		||||
{  
 | 
			
		||||
  ExpressionViewOpen(expr.arg1); // recurse AST
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename T1, typename T2> inline 
 | 
			
		||||
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  ExpressionViewOpen(expr.arg1);  // recurse AST
 | 
			
		||||
  ExpressionViewOpen(expr.arg2);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
template <typename Op, typename T1, typename T2, typename T3>
 | 
			
		||||
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  ExpressionViewOpen(expr.arg1);  // recurse AST
 | 
			
		||||
  ExpressionViewOpen(expr.arg2);  // recurse AST
 | 
			
		||||
  ExpressionViewOpen(expr.arg3);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// ViewClose
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
 | 
			
		||||
inline void ExpressionViewClose( T1 &lat)  // Lattice leaf
 | 
			
		||||
{
 | 
			
		||||
  lat.ViewClose();
 | 
			
		||||
}
 | 
			
		||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 | 
			
		||||
inline void ExpressionViewClose(T1 ¬lat) {}
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename T1> inline 
 | 
			
		||||
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr) 
 | 
			
		||||
{  
 | 
			
		||||
  ExpressionViewClose(expr.arg1); // recurse AST
 | 
			
		||||
}
 | 
			
		||||
template <typename Op, typename T1, typename T2> inline 
 | 
			
		||||
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  ExpressionViewClose(expr.arg1);  // recurse AST
 | 
			
		||||
  ExpressionViewClose(expr.arg2);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
template <typename Op, typename T1, typename T2, typename T3>
 | 
			
		||||
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  ExpressionViewClose(expr.arg1);  // recurse AST
 | 
			
		||||
  ExpressionViewClose(expr.arg2);  // recurse AST
 | 
			
		||||
  ExpressionViewClose(expr.arg3);  // recurse AST
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
// Unary operators and funcs
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  auto rhs_v = rhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  conformable(lhs,rhs);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
@@ -56,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  conformable(lhs,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  auto rhs_v = rhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  conformable(lhs,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  auto rhs_v = rhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  conformable(lhs,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  auto rhs_v = rhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(lhs,ret);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    mult(&tmp,&lhs_v(ss),&rhs);
 | 
			
		||||
@@ -121,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(ret,lhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(ret,lhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  conformable(lhs,ret);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto lhs_t=lhs_v(ss);
 | 
			
		||||
@@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto rhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( rhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto rhs_t=rhs_v(ss);
 | 
			
		||||
@@ -179,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto rhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( rhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto rhs_t=rhs_v(ss);
 | 
			
		||||
@@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto rhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( rhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto rhs_t=rhs_v(ss);
 | 
			
		||||
@@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 | 
			
		||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  conformable(ret,rhs);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto rhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( rhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
 | 
			
		||||
    decltype(coalescedRead(obj1())) tmp;
 | 
			
		||||
    auto rhs_t=rhs_v(ss);
 | 
			
		||||
@@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
 | 
			
		||||
  ret.Checkerboard() = x.Checkerboard();
 | 
			
		||||
  conformable(ret,x);
 | 
			
		||||
  conformable(x,y);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto x_v = x.AcceleratorView(ViewRead);
 | 
			
		||||
  auto y_v = y.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( x_v , x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v , y, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    auto tmp = a*x_v(ss)+y_v(ss);
 | 
			
		||||
    coalescedWrite(ret_v[ss],tmp);
 | 
			
		||||
@@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 | 
			
		||||
  ret.Checkerboard() = x.Checkerboard();
 | 
			
		||||
  conformable(ret,x);
 | 
			
		||||
  conformable(x,y);
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto x_v = x.AcceleratorView(ViewRead);
 | 
			
		||||
  auto y_v = y.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( x_v , x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v , y, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    auto tmp = a*x_v(ss)+b*y_v(ss);
 | 
			
		||||
    coalescedWrite(ret_v[ss],tmp);
 | 
			
		||||
 
 | 
			
		||||
@@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
			   /*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#pragma once 
 | 
			
		||||
 | 
			
		||||
#define STREAMING_STORES
 | 
			
		||||
@@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
extern int GridCshiftPermuteMap[4][16];
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Base class which can be used by traits to pick up behaviour.
// It is an empty tag type: LatticeAccelerator derives from it, and the
// is_lattice / ViewMap helpers detect lattice types with
// std::is_base_of<LatticeBase, T>.
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
class LatticeBase {};
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Conformable checks; same instance of Grid required
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 | 
			
		||||
{
 | 
			
		||||
  // Conformable means pointer-identical here: both arguments must refer to
  // the very same GridBase object, otherwise the assertion fires.
  assert(lhs == rhs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Advise the LatticeAccelerator class
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Hint flags passed to LatticeAccelerator::Advise(int).
// Each enumerator is a distinct bit; Advise() tests them with '&', so several
// hints can be combined in one integer argument.
enum LatticeAcceleratorAdvise {
 | 
			
		||||
  AdviseInfrequentUse = 0x1,    // Advise that the data is used infrequently.  This can
 | 
			
		||||
                                // significantly influence performance of bulk storage.
 | 
			
		||||
  AdviseReadMostly = 0x2,       // Data will mostly be read.  On some architectures
 | 
			
		||||
                                // enables read-only copies of memory to be kept on
 | 
			
		||||
                                // host and device.
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// View Access Mode
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Access intent requested when a view of a lattice is opened.
// The values are bit flags: ViewReadWrite (0x3) equals ViewRead | ViewWrite.
// ViewReadWrite is the default accessMode of AcceleratorPrefetch / HostPrefetch.
enum ViewMode {
 | 
			
		||||
  ViewRead = 0x1,      // data will only be read through the view
 | 
			
		||||
  ViewWrite = 0x2,     // data will only be written through the view
 | 
			
		||||
  ViewReadWrite = 0x3  // data will be both read and written
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Minimal base class containing only data valid to access from accelerator
 | 
			
		||||
// _odata will be a managed pointer in CUDA
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Force access to lattice through a view object.
 | 
			
		||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
 | 
			
		||||
// strict, since host code could in principle access the data directly through the lattice object
 | 
			
		||||
// Need to decide programming model.
 | 
			
		||||
#define LATTICE_VIEW_STRICT
 | 
			
		||||
// Minimal lattice state: grid pointer, checkerboard label, data pointer and
// element count.  LatticeView derives from this class and copy-constructs it.
template<class vobj> class LatticeAccelerator : public LatticeBase
 | 
			
		||||
{
 | 
			
		||||
protected:
 | 
			
		||||
  // Grid this lattice is defined on; compared by pointer in Conformable().
  GridBase *_grid;
 | 
			
		||||
  // Checkerboard label, read and assigned through Checkerboard().
  int checkerboard;
 | 
			
		||||
  vobj     *_odata;    // A managed pointer
 | 
			
		||||
  // Number of vobj elements addressed by _odata (returned by oSites()).
  uint64_t _odata_size;    
 | 
			
		||||
public:
 | 
			
		||||
  // Default state: checkerboard 0, null data pointer, zero size, null grid.
  // NOTE(review): the initializer list order differs from the declaration order
  // (_grid is declared first); C++ initializes members in declaration order, which
  // is harmless here because every initializer is a constant, but compilers may warn.
  accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; 
 | 
			
		||||
  // Number of elements in the data array.
  accelerator_inline uint64_t oSites(void) const { return _odata_size; };
 | 
			
		||||
  // Read-only access to the checkerboard label.
  accelerator_inline int  Checkerboard(void) const { return checkerboard; };
 | 
			
		||||
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
 | 
			
		||||
  // If 'grid' is non-null, check it is the same grid as ours (asserting via
  // conformable); if it is null, set it to this lattice's grid.
  accelerator_inline void Conformable(GridBase * &grid) const
 | 
			
		||||
  { 
 | 
			
		||||
    if (grid) conformable(grid, _grid);
 | 
			
		||||
    else      grid = _grid;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Apply LatticeAcceleratorAdvise hint bits to the data array with cudaMemAdvise.
  // The body is compiled only when GRID_NVCC is defined and __CUDA_ARCH__ is not;
  // otherwise this is a no-op.  The cudaMemAdvise return values are not checked.
  accelerator_inline void Advise(int advise) {
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifndef __CUDA_ARCH__ // only on host
 | 
			
		||||
    if (advise & AdviseInfrequentUse) {
 | 
			
		||||
      // Set the preferred location of the whole array to the CPU.
      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
 | 
			
		||||
    }
 | 
			
		||||
    if (advise & AdviseReadMostly) {
 | 
			
		||||
      // Mark the whole array as read-mostly; -1 is passed as the device argument.
      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Asynchronously prefetch the whole data array to the current CUDA device
  // (queried with cudaGetDevice).  No-op unless GRID_NVCC is defined and the code
  // is compiled for the host.  The accessMode argument is currently ignored.
  accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifndef __CUDA_ARCH__ // only on host
 | 
			
		||||
    int target;
 | 
			
		||||
    cudaGetDevice(&target);
 | 
			
		||||
    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Asynchronously prefetch the whole data array to the CPU (cudaCpuDeviceId).
  // Same compile-time guards as AcceleratorPrefetch; accessMode is currently ignored.
  accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifndef __CUDA_ARCH__ // only on host
 | 
			
		||||
    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// A View class which provides accessor to the data.
 | 
			
		||||
// This will be safe to call from accelerator_for and is trivially copy constructible
 | 
			
		||||
// The copy constructor for this will need to be used by device lambda functions
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Accessor over the data of a LatticeAccelerator.  It adds element access and
// size queries on top of the base class state and owns no additional members.
template<class vobj> 
 | 
			
		||||
class LatticeView : public LatticeAccelerator<vobj>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Rvalue
  // operator() is the read accessor.  When compiling device code (__CUDA_ARCH__
  // defined) it returns a vobj::scalar_object by value obtained from coalescedRead;
  // otherwise it returns a const reference to the stored vobj.
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
 | 
			
		||||
#else 
 | 
			
		||||
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  // operator[] always yields a reference to the stored vobj (const and non-const).
  // No bounds checking is performed.
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
 | 
			
		||||
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
 | 
			
		||||
 | 
			
		||||
  // Index range helpers: valid indices are [begin(), end()); size() equals end().
  accelerator_inline uint64_t begin(void) const { return 0;};
 | 
			
		||||
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
 | 
			
		||||
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
 | 
			
		||||
 | 
			
		||||
  // Build a view by copy-constructing the base-class state from 'refer_to_me'.
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
 | 
			
		||||
  {
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Lattice expression types used by ET to assemble the AST
 | 
			
		||||
// 
 | 
			
		||||
// Need to be able to detect code paths according to the whether a lattice object or not
 | 
			
		||||
// so introduce some trait type things
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
// Empty tag base class for the Lattice*Expression node types.
class LatticeExpressionBase {};
 | 
			
		||||
 | 
			
		||||
// Trait: true when T derives from LatticeBase.
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 | 
			
		||||
// Trait: true when T derives from LatticeExpressionBase.
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 | 
			
		||||
 | 
			
		||||
// ViewMap<T>::Type is the type an expression node stores for an operand of type T:
// T itself in the general case ...
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
 | 
			
		||||
// ... and LatticeView<T::vector_object> when T derives from LatticeBase.
template<class T>                 struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
 | 
			
		||||
// Selects between the two ViewMapBase forms using std::is_base_of<LatticeBase, T>.
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
 | 
			
		||||
 | 
			
		||||
// Expression node holding an operator object and one operand.
// The operand is stored as ViewMap<_T1>::Type.
template <typename Op, typename _T1>                           
 | 
			
		||||
class LatticeUnaryExpression : public  LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  Op op;    // the operation, stored by value
 | 
			
		||||
  T1 arg1;  // the operand, constructed from the _T1 argument
 | 
			
		||||
  LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Expression node holding an operator object and two operands.
// Each operand is stored as the corresponding ViewMap<...>::Type.
template <typename Op, typename _T1, typename _T2>              
 | 
			
		||||
class LatticeBinaryExpression : public LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  typedef typename ViewMap<_T2>::Type T2;
 | 
			
		||||
  Op op;    // the operation, stored by value
 | 
			
		||||
  T1 arg1;  // first operand
 | 
			
		||||
  T2 arg2;  // second operand
 | 
			
		||||
  LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Expression node holding an operator object and three operands.
// Each operand is stored as the corresponding ViewMap<...>::Type.
template <typename Op, typename _T1, typename _T2, typename _T3> 
 | 
			
		||||
class LatticeTrinaryExpression : public LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  typedef typename ViewMap<_T2>::Type T2;
 | 
			
		||||
  typedef typename ViewMap<_T3>::Type T3;
 | 
			
		||||
  Op op;    // the operation, stored by value
 | 
			
		||||
  T1 arg1;  // first operand
 | 
			
		||||
  T2 arg2;  // second operand
 | 
			
		||||
  T3 arg3;  // third operand
 | 
			
		||||
  LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// The real lattice class, with normal copy and assignment semantics.
 | 
			
		||||
// This contains extra (host resident) grid pointer data that may be accessed by host code
 | 
			
		||||
@@ -246,38 +73,33 @@ private:
 | 
			
		||||
      dealloc();
 | 
			
		||||
      
 | 
			
		||||
      this->_odata_size = size;
 | 
			
		||||
      if ( size ) 
 | 
			
		||||
      if ( size )
 | 
			
		||||
	this->_odata      = alloc.allocate(this->_odata_size);
 | 
			
		||||
      else 
 | 
			
		||||
	this->_odata      = nullptr;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Opens a view of this lattice in the requested mode and closes it again
  // immediately; no data is touched through the view itself.
  void SetViewMode(ViewMode mode) {
 | 
			
		||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
 | 
			
		||||
    accessor.ViewClose();
 | 
			
		||||
  }
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Return a view object that may be dereferenced in site loops.
 | 
			
		||||
  // The view is trivially copy constructible and may be copied to an accelerator device
 | 
			
		||||
  // in device lambdas
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
 | 
			
		||||
  {                                   //                     and HostView        for thread_for
 | 
			
		||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
 | 
			
		||||
 | 
			
		||||
  // Returns, by value, a view of this lattice opened in the given mode.
  // The view is built from this object's LatticeAccelerator base part.
  // NOTE(review): callers in this file pair View(mode) with ViewClose() - confirm
  // that closing is the caller's responsibility.
  LatticeView<vobj> View (ViewMode mode) const 
 | 
			
		||||
  {
 | 
			
		||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
 | 
			
		||||
    return accessor;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Returns a view of this lattice after calling AcceleratorPrefetch(mode) on it.
  // 'mode' defaults to ViewReadWrite and is only forwarded to the prefetch call.
  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
 | 
			
		||||
  {
 | 
			
		||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
 | 
			
		||||
    accessor.AcceleratorPrefetch(mode);
 | 
			
		||||
    return accessor;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Returns a view of this lattice after calling HostPrefetch(mode) on it.
  // 'mode' defaults to ViewReadWrite and is only forwarded to the prefetch call.
  LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
 | 
			
		||||
  {
 | 
			
		||||
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
 | 
			
		||||
    accessor.HostPrefetch(mode);
 | 
			
		||||
    return accessor;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  ~Lattice() { 
 | 
			
		||||
    if ( this->_odata_size ) {
 | 
			
		||||
      dealloc();
 | 
			
		||||
@@ -297,12 +119,16 @@ public:
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    this->checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    auto me  = AcceleratorView(ViewWrite);
 | 
			
		||||
    
 | 
			
		||||
    auto exprCopy = expr;
 | 
			
		||||
    ExpressionViewOpen(exprCopy);
 | 
			
		||||
    auto me  = View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),1,{
 | 
			
		||||
      auto tmp = eval(ss,expr);
 | 
			
		||||
      auto tmp = eval(ss,exprCopy);
 | 
			
		||||
      vstream(me[ss],tmp);
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();
 | 
			
		||||
    ExpressionViewClose(exprCopy);
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
 | 
			
		||||
@@ -317,11 +143,15 @@ public:
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    this->checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    auto me  = AcceleratorView(ViewWrite);
 | 
			
		||||
    auto exprCopy = expr;
 | 
			
		||||
    ExpressionViewOpen(exprCopy);
 | 
			
		||||
    auto me  = View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),1,{
 | 
			
		||||
      auto tmp = eval(ss,expr);
 | 
			
		||||
      auto tmp = eval(ss,exprCopy);
 | 
			
		||||
      vstream(me[ss],tmp);
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();
 | 
			
		||||
    ExpressionViewClose(exprCopy);
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
 | 
			
		||||
@@ -335,11 +165,15 @@ public:
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    this->checkerboard=cb;
 | 
			
		||||
    auto me  = AcceleratorView(ViewWrite);
 | 
			
		||||
    auto exprCopy = expr;
 | 
			
		||||
    ExpressionViewOpen(exprCopy);
 | 
			
		||||
    auto me  = View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),1,{
 | 
			
		||||
      auto tmp = eval(ss,expr);
 | 
			
		||||
      auto tmp = eval(ss,exprCopy);
 | 
			
		||||
      vstream(me[ss],tmp);
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();
 | 
			
		||||
    ExpressionViewClose(exprCopy);
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  //GridFromExpression is tricky to do
 | 
			
		||||
@@ -390,10 +224,11 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
 | 
			
		||||
    auto me  = View();
 | 
			
		||||
    auto me  = View(CpuWrite);
 | 
			
		||||
    thread_for(ss,me.size(),{
 | 
			
		||||
      me[ss] = r;
 | 
			
		||||
	me[ss]= r;
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -403,11 +238,12 @@ public:
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  // user defined constructor
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  Lattice(GridBase *grid) { 
 | 
			
		||||
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
 | 
			
		||||
    this->_grid = grid;
 | 
			
		||||
    resize(this->_grid->oSites());
 | 
			
		||||
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
 | 
			
		||||
    this->checkerboard=0;
 | 
			
		||||
    SetViewMode(mode);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //  virtual ~Lattice(void) = default;
 | 
			
		||||
@@ -445,11 +281,12 @@ public:
 | 
			
		||||
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    auto me =   AcceleratorView(ViewWrite);
 | 
			
		||||
    auto him= r.AcceleratorView(ViewRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();    him.ViewClose();
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -459,11 +296,12 @@ public:
 | 
			
		||||
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    auto me =   AcceleratorView(ViewWrite);
 | 
			
		||||
    auto him= r.AcceleratorView(ViewRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
    me.ViewClose();    him.ViewClose();
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -51,34 +51,18 @@ template<class VField, class Matrix>
 | 
			
		||||
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) 
 | 
			
		||||
{
 | 
			
		||||
  typedef decltype(basis[0]) Field;
 | 
			
		||||
  typedef decltype(basis[0].View()) View;
 | 
			
		||||
  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
 | 
			
		||||
  Vector<View> basis_v(basis.size(),tmp_v);
 | 
			
		||||
  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
 | 
			
		||||
  typedef decltype(basis[0].View(AcceleratorRead)) View;
 | 
			
		||||
 | 
			
		||||
  Vector<View> basis_v; basis_v.reserve(basis.size());
 | 
			
		||||
  GridBase* grid = basis[0].Grid();
 | 
			
		||||
      
 | 
			
		||||
  for(int k=0;k<basis.size();k++){
 | 
			
		||||
    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
 | 
			
		||||
    basis_v.push_back(basis[k].View(AcceleratorWrite));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
    std::vector < vobj > B(Nm); // Thread private
 | 
			
		||||
    thread_for_in_region(ss, grid->oSites(),{
 | 
			
		||||
	for(int j=j0; j<j1; ++j) B[j]=0.;
 | 
			
		||||
      
 | 
			
		||||
	for(int j=j0; j<j1; ++j){
 | 
			
		||||
	  for(int k=k0; k<k1; ++k){
 | 
			
		||||
	    B[j] +=Qt(j,k) * basis_v[k][ss];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	for(int j=j0; j<j1; ++j){
 | 
			
		||||
	  basis_v[j][ss] = B[j];
 | 
			
		||||
	}
 | 
			
		||||
      });
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
  View *basis_vp = &basis_v[0];
 | 
			
		||||
 | 
			
		||||
  int nrot = j1-j0;
 | 
			
		||||
  if (!nrot) // edge case not handled gracefully by Cuda
 | 
			
		||||
    return;
 | 
			
		||||
@@ -86,6 +70,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
			
		||||
  uint64_t oSites   =grid->oSites();
 | 
			
		||||
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
 | 
			
		||||
 | 
			
		||||
  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
 | 
			
		||||
 | 
			
		||||
  Vector <vobj> Bt(siteBlock * nrot); 
 | 
			
		||||
  auto Bp=&Bt[0];
 | 
			
		||||
 | 
			
		||||
@@ -96,7 +82,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
			
		||||
      int j = i/Nm;
 | 
			
		||||
      int k = i%Nm;
 | 
			
		||||
      Qt_p[i]=Qt(j,k);
 | 
			
		||||
    });
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  // Block the loop to keep storage footprint down
 | 
			
		||||
  for(uint64_t s=0;s<oSites;s+=siteBlock){
 | 
			
		||||
@@ -132,27 +118,30 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
			
		||||
	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
 | 
			
		||||
      });
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Extract a single rotated vector
 | 
			
		||||
template<class Field>
 | 
			
		||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 | 
			
		||||
{
 | 
			
		||||
  typedef decltype(basis[0].AcceleratorView()) View;
 | 
			
		||||
  typedef decltype(basis[0].View(AcceleratorRead)) View;
 | 
			
		||||
  typedef typename Field::vector_object vobj;
 | 
			
		||||
  GridBase* grid = basis[0].Grid();
 | 
			
		||||
 | 
			
		||||
  result.Checkerboard() = basis[0].Checkerboard();
 | 
			
		||||
  auto result_v=result.AcceleratorView(ViewWrite);
 | 
			
		||||
  Vector<View> basis_v(basis.size(),result_v);
 | 
			
		||||
 | 
			
		||||
  Vector<View> basis_v; basis_v.reserve(basis.size());
 | 
			
		||||
  for(int k=0;k<basis.size();k++){
 | 
			
		||||
    basis_v[k] = basis[k].AcceleratorView(ViewRead);
 | 
			
		||||
    basis_v.push_back(basis[k].View(AcceleratorRead));
 | 
			
		||||
  }
 | 
			
		||||
  vobj zz=Zero();
 | 
			
		||||
  Vector<double> Qt_jv(Nm);
 | 
			
		||||
  double * Qt_j = & Qt_jv[0];
 | 
			
		||||
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
 | 
			
		||||
 | 
			
		||||
  autoView(result_v,result,AcceleratorWrite);
 | 
			
		||||
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
 | 
			
		||||
    auto B=coalescedRead(zz);
 | 
			
		||||
    for(int k=k0; k<k1; ++k){
 | 
			
		||||
@@ -160,6 +149,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
 | 
			
		||||
    }
 | 
			
		||||
    coalescedWrite(result_v[ss], B);
 | 
			
		||||
  });
 | 
			
		||||
  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Field>
 | 
			
		||||
 
 | 
			
		||||
@@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
 | 
			
		||||
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
 | 
			
		||||
{
 | 
			
		||||
  Lattice<vPredicate> ret(rhs.Grid());
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( lhs_v, lhs, CpuRead);
 | 
			
		||||
  autoView( rhs_v, rhs, CpuRead);
 | 
			
		||||
  autoView( ret_v, ret, CpuWrite);
 | 
			
		||||
  thread_for( ss, rhs_v.size(), {
 | 
			
		||||
      ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
 | 
			
		||||
  });
 | 
			
		||||
@@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
 | 
			
		||||
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
 | 
			
		||||
{
 | 
			
		||||
  Lattice<vPredicate> ret(lhs.Grid());
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( lhs_v, lhs, CpuRead);
 | 
			
		||||
  autoView( ret_v, ret, CpuWrite);
 | 
			
		||||
  thread_for( ss, lhs_v.size(), {
 | 
			
		||||
    ret_v[ss]=op(lhs_v[ss],rhs);
 | 
			
		||||
  });
 | 
			
		||||
@@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
 | 
			
		||||
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
 | 
			
		||||
{
 | 
			
		||||
  Lattice<vPredicate> ret(rhs.Grid());
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( rhs_v, rhs, CpuRead);
 | 
			
		||||
  autoView( ret_v, ret, CpuWrite);
 | 
			
		||||
  thread_for( ss, rhs_v.size(), {
 | 
			
		||||
    ret_v[ss]=op(lhs,rhs_v[ss]);
 | 
			
		||||
  });
 | 
			
		||||
 
 | 
			
		||||
@@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
 | 
			
		||||
  GridBase *grid = l.Grid();
 | 
			
		||||
  int Nsimd = grid->iSites();
 | 
			
		||||
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  autoView(l_v, l, CpuWrite);
 | 
			
		||||
  thread_for( o, grid->oSites(), {
 | 
			
		||||
    vector_type vI;
 | 
			
		||||
    Coordinate gcoor;
 | 
			
		||||
@@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
 | 
			
		||||
  });
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// LatticeCoordinate();
 | 
			
		||||
// FIXME for debug; deprecate this; made obsolete by 
 | 
			
		||||
// Debug helper: overwrites the storage of lattice 'l' with index-derived values,
// writing through a raw Real pointer to the first element of the view.
template<class vobj> void lex_sites(Lattice<vobj> &l){
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  // Treat the lattice data as a flat array of Real.
  // NOTE(review): the lengths below are derived from vRealF while the pointer is
  // Real* - presumably this assumes Real and the vRealF lane type match; confirm.
  Real *v_ptr = (Real *)&l_v[0];
 | 
			
		||||
  size_t o_len = l.Grid()->oSites();            // number of outer sites
 | 
			
		||||
  size_t v_len = sizeof(vobj)/sizeof(vRealF);   // vRealF-sized words per site object
 | 
			
		||||
  size_t vec_len = vRealF::Nsimd();             // entries per vRealF word
 | 
			
		||||
 | 
			
		||||
  // The loop indices are int while the bounds are size_t (signed/unsigned comparison).
  for(int i=0;i<o_len;i++){
 | 
			
		||||
    for(int j=0;j<v_len;j++){
 | 
			
		||||
      // Entries are written in pairs (vv, vv+1); both receive the value i+vv*500.
      for(int vv=0;vv<vec_len;vv+=2){
 | 
			
		||||
	v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
 | 
			
		||||
	v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
 | 
			
		||||
      }
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -43,8 +43,8 @@ template<class vobj>
 | 
			
		||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
 | 
			
		||||
{
 | 
			
		||||
  Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
@@ -56,9 +56,9 @@ template<class vobj>
 | 
			
		||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
 | 
			
		||||
{
 | 
			
		||||
  Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
@@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
 | 
			
		||||
  typedef decltype(coalescedRead(ll())) sll;
 | 
			
		||||
  typedef decltype(coalescedRead(rr())) srr;
 | 
			
		||||
  Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),1,{
 | 
			
		||||
    // FIXME had issues with scalar version of outer 
 | 
			
		||||
    // Use vector [] operator and don't read coalesce this loop
 | 
			
		||||
 
 | 
			
		||||
@@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
 | 
			
		||||
  int block =FullGrid->_slice_block [Orthog];
 | 
			
		||||
  int nblock=FullGrid->_slice_nblock[Orthog];
 | 
			
		||||
  int ostride=FullGrid->_ostride[Orthog];
 | 
			
		||||
  auto X_v = X.View();
 | 
			
		||||
  auto Y_v = Y.View();
 | 
			
		||||
  auto R_v = R.View();
 | 
			
		||||
  autoView( X_v , X, CpuRead);
 | 
			
		||||
  autoView( Y_v , Y, CpuRead);
 | 
			
		||||
  autoView( R_v , R, CpuWrite);
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<vobj> s_x(Nblock);
 | 
			
		||||
@@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 | 
			
		||||
  int nblock=FullGrid->_slice_nblock[Orthog];
 | 
			
		||||
  int ostride=FullGrid->_ostride[Orthog];
 | 
			
		||||
 | 
			
		||||
  auto X_v = X.View();
 | 
			
		||||
  auto R_v = R.View();
 | 
			
		||||
  autoView( X_v , X, CpuRead);
 | 
			
		||||
  autoView( R_v , R, CpuWrite);
 | 
			
		||||
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
@@ -156,8 +156,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
  int ostride=FullGrid->_ostride[Orthog];
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::vector_typeD vector_typeD;
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  autoView( lhs_v , lhs, CpuRead);
 | 
			
		||||
  autoView( rhs_v , rhs, CpuRead);
 | 
			
		||||
  thread_region {
 | 
			
		||||
    std::vector<vobj> Left(Nblock);
 | 
			
		||||
    std::vector<vobj> Right(Nblock);
 | 
			
		||||
 
 | 
			
		||||
@@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
 | 
			
		||||
{
 | 
			
		||||
  Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
 | 
			
		||||
  ret.Checkerboard()=lhs.Checkerboard();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for( ss, lhs_v.size(), {
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), 1, {
 | 
			
		||||
    ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
 | 
			
		||||
  });
 | 
			
		||||
  return ret;
 | 
			
		||||
@@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
 | 
			
		||||
{
 | 
			
		||||
  Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
 | 
			
		||||
  ret.Checkerboard()=lhs.Checkerboard();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for( ss, lhs_v.size(), {
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), 1, {
 | 
			
		||||
    ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
 | 
			
		||||
  });
 | 
			
		||||
  return ret;
 | 
			
		||||
@@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
 | 
			
		||||
template<int Index,class vobj>  
 | 
			
		||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
 | 
			
		||||
{
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for( ss, lhs_v.size(), {
 | 
			
		||||
  autoView( rhs_v, rhs, AcceleratorRead);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorWrite);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), 1, {
 | 
			
		||||
    pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
template<int Index,class vobj> 
 | 
			
		||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
 | 
			
		||||
{
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  thread_for( ss, lhs_v.size(), {
 | 
			
		||||
  autoView( rhs_v, rhs, AcceleratorRead);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorWrite);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), 1, {
 | 
			
		||||
    pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
@@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 | 
			
		||||
 | 
			
		||||
  // extract-modify-merge cycle is easiest way and this is not perf critical
 | 
			
		||||
  ExtractBuffer<sobj> buf(Nsimd);
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  autoView( l_v , l, CpuWrite);
 | 
			
		||||
  if ( rank == grid->ThisRank() ) {
 | 
			
		||||
    extract(l_v[odx],buf);
 | 
			
		||||
    buf[idx] = s;
 | 
			
		||||
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 | 
			
		||||
  grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 | 
			
		||||
 | 
			
		||||
  ExtractBuffer<sobj> buf(Nsimd);
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  autoView( l_v , l, CpuWrite);
 | 
			
		||||
  extract(l_v[odx],buf);
 | 
			
		||||
 | 
			
		||||
  s = buf[idx];
 | 
			
		||||
@@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 | 
			
		||||
  return;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////
 | 
			
		||||
// Peek a scalar object from the SIMD array
 | 
			
		||||
//////////////////////////////////////////////////////////
 | 
			
		||||
// Must be CPU read view
 | 
			
		||||
template<class vobj,class sobj>
 | 
			
		||||
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 | 
			
		||||
        
 | 
			
		||||
  GridBase *grid = l.Grid();
 | 
			
		||||
 | 
			
		||||
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *grid = l.getGrid();
 | 
			
		||||
  assert(l.mode==CpuRead);
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
 | 
			
		||||
  assert( l.Checkerboard()== grid->CheckerBoard(site));
 | 
			
		||||
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 | 
			
		||||
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
@@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 | 
			
		||||
  idx= grid->iIndex(site);
 | 
			
		||||
  odx= grid->oIndex(site);
 | 
			
		||||
  
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l_v[odx];
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l[odx];
 | 
			
		||||
  scalar_type * pt = (scalar_type *)&s;
 | 
			
		||||
      
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
@@ -183,18 +182,19 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
 | 
			
		||||
      
 | 
			
		||||
  return;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Must be CPU write view
 | 
			
		||||
template<class vobj,class sobj>
 | 
			
		||||
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
 | 
			
		||||
 | 
			
		||||
  GridBase *grid=l.Grid();
 | 
			
		||||
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *grid=l.getGrid();
 | 
			
		||||
  assert(l.mode==CpuWrite);
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
  int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
 | 
			
		||||
  assert( l.Checkerboard()== grid->CheckerBoard(site));
 | 
			
		||||
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 | 
			
		||||
 | 
			
		||||
  static const int words=sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
@@ -202,13 +202,11 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
 | 
			
		||||
  idx= grid->iIndex(site);
 | 
			
		||||
  odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
  auto l_v = l.View();
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l_v[odx];
 | 
			
		||||
  scalar_type * vp = (scalar_type *)&l[odx];
 | 
			
		||||
  scalar_type * pt = (scalar_type *)&s;
 | 
			
		||||
  for(int w=0;w<words;w++){
 | 
			
		||||
    vp[idx+w*Nsimd] = pt[w];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -40,9 +40,11 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 | 
			
		||||
  Lattice<vobj> ret(lhs.Grid());
 | 
			
		||||
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  ret.Checkerboard()=lhs.Checkerboard();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
 | 
			
		||||
    coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
@@ -51,9 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
 | 
			
		||||
  Lattice<vobj> ret(lhs.Grid());
 | 
			
		||||
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  ret.Checkerboard() = lhs.Checkerboard();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
 | 
			
		||||
    coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
 
 | 
			
		||||
@@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
@@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object  sobj;
 | 
			
		||||
 | 
			
		||||
  const int Nsimd = vobj::Nsimd();
 | 
			
		||||
  //  const int Nsimd = vobj::Nsimd();
 | 
			
		||||
  const int nthread = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  Vector<sobj> sumarray(nthread);
 | 
			
		||||
  for(int i=0;i<nthread;i++){
 | 
			
		||||
    sumarray[i]=Zero();
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  thread_for(thr,nthread, {
 | 
			
		||||
    int nwork, mywork, myoff;
 | 
			
		||||
    nwork = osites;
 | 
			
		||||
    GridThread::GetWork(nwork,thr,mywork,myoff);
 | 
			
		||||
    vobj vvsum=Zero();
 | 
			
		||||
    for(int ss=myoff;ss<mywork+myoff; ss++){
 | 
			
		||||
      vvsum = vvsum + arg[ss];
 | 
			
		||||
    }
 | 
			
		||||
    sumarray[thr]=Reduce(vvsum);
 | 
			
		||||
  });
 | 
			
		||||
  
 | 
			
		||||
  sobj ssum=Zero();  // sum across threads
 | 
			
		||||
  for(int i=0;i<nthread;i++){
 | 
			
		||||
    ssum = ssum+sumarray[i];
 | 
			
		||||
  } 
 | 
			
		||||
  return ssum;
 | 
			
		||||
}
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_objectD  sobj;
 | 
			
		||||
 | 
			
		||||
  const int nthread = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  Vector<sobj> sumarray(nthread);
 | 
			
		||||
@@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 | 
			
		||||
    ssum = ssum+sumarray[i];
 | 
			
		||||
  } 
 | 
			
		||||
  
 | 
			
		||||
  return ssum;
 | 
			
		||||
  typedef typename vobj::scalar_object ssobj;
 | 
			
		||||
  ssobj ret = ssum;
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
  return sum_gpu(arg,osites);
 | 
			
		||||
#else
 | 
			
		||||
  return sum_cpu(arg,osites);
 | 
			
		||||
#endif  
 | 
			
		||||
}
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
  return sumD_gpu(arg,osites);
 | 
			
		||||
#else
 | 
			
		||||
  return sumD_cpu(arg,osites);
 | 
			
		||||
#endif  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
  auto arg_v = arg.View();
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum(&arg_v[0],osites);
 | 
			
		||||
  auto ssum= sum_gpu(&arg_v[0],osites);
 | 
			
		||||
#else
 | 
			
		||||
  autoView(arg_v, arg, CpuRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_cpu(&arg_v[0],osites);
 | 
			
		||||
#endif  
 | 
			
		||||
  arg.Grid()->GlobalSum(ssum);
 | 
			
		||||
  return ssum;
 | 
			
		||||
}
 | 
			
		||||
@@ -101,43 +150,30 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 | 
			
		||||
  ComplexD  nrm;
 | 
			
		||||
  
 | 
			
		||||
  GridBase *grid = left.Grid();
 | 
			
		||||
  
 | 
			
		||||
  // Might make all code paths go this way.
 | 
			
		||||
  auto left_v = left.AcceleratorView(ViewRead);
 | 
			
		||||
  auto right_v=right.AcceleratorView(ViewRead);
 | 
			
		||||
 | 
			
		||||
  const uint64_t nsimd = grid->Nsimd();
 | 
			
		||||
  const uint64_t sites = grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
  // GPU - SIMT lane compliance...
 | 
			
		||||
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
 | 
			
		||||
  // Might make all code paths go this way.
 | 
			
		||||
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  
 | 
			
		||||
    
 | 
			
		||||
  {
 | 
			
		||||
    autoView( left_v , left, AcceleratorRead);
 | 
			
		||||
    autoView( right_v,right, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto x_l = left_v(ss);
 | 
			
		||||
      auto y_l = right_v(ss);
 | 
			
		||||
      coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
 | 
			
		||||
  })
 | 
			
		||||
    // GPU - SIMT lane compliance...
 | 
			
		||||
    accelerator_for( ss, sites, 1,{
 | 
			
		||||
	auto x_l = left_v[ss];
 | 
			
		||||
	auto y_l = right_v[ss];
 | 
			
		||||
	inner_tmp_v[ss]=innerProductD(x_l,y_l);
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // This is in single precision and fails some tests
 | 
			
		||||
  // Need a sumD that sums in double
 | 
			
		||||
  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));  
 | 
			
		||||
#else
 | 
			
		||||
  // CPU 
 | 
			
		||||
  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto x_l = left_v[ss];
 | 
			
		||||
      auto y_l = right_v[ss];
 | 
			
		||||
      inner_tmp_v[ss]=innerProductD(x_l,y_l);
 | 
			
		||||
  })
 | 
			
		||||
  nrm = TensorRemove(sum(inner_tmp_v,sites));
 | 
			
		||||
#endif
 | 
			
		||||
  auto anrm = sum(inner_tmp_v,sites);  
 | 
			
		||||
  nrm = anrm;
 | 
			
		||||
  return nrm;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -175,40 +211,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 | 
			
		||||
  
 | 
			
		||||
  GridBase *grid = x.Grid();
 | 
			
		||||
 | 
			
		||||
  auto x_v=x.AcceleratorView(ViewRead);
 | 
			
		||||
  auto y_v=y.AcceleratorView(ViewRead);
 | 
			
		||||
  auto z_v=z.AcceleratorView(ViewWrite);
 | 
			
		||||
 | 
			
		||||
  const uint64_t nsimd = grid->Nsimd();
 | 
			
		||||
  const uint64_t sites = grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
  // GPU
 | 
			
		||||
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto tmp = a*x_v(ss)+b*y_v(ss);
 | 
			
		||||
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
 | 
			
		||||
      coalescedWrite(z_v[ss],tmp);
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
 | 
			
		||||
#else
 | 
			
		||||
  // CPU 
 | 
			
		||||
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto tmp = a*x_v(ss)+b*y_v(ss);
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, 1,{
 | 
			
		||||
      auto tmp = a*x_v[ss]+b*y_v[ss];
 | 
			
		||||
      inner_tmp_v[ss]=innerProductD(tmp,tmp);
 | 
			
		||||
      z_v[ss]=tmp;
 | 
			
		||||
  });
 | 
			
		||||
  // Already promoted to double
 | 
			
		||||
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
 | 
			
		||||
#endif
 | 
			
		||||
  grid->GlobalSum(nrm);
 | 
			
		||||
  return nrm; 
 | 
			
		||||
}
 | 
			
		||||
@@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = left.Grid();
 | 
			
		||||
 | 
			
		||||
  auto left_v=left.AcceleratorView(ViewRead);
 | 
			
		||||
  auto right_v=right.AcceleratorView(ViewRead);
 | 
			
		||||
 | 
			
		||||
  const uint64_t nsimd = grid->Nsimd();
 | 
			
		||||
  const uint64_t sites = grid->oSites();
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
  // GPU
 | 
			
		||||
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
 | 
			
		||||
  typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
 | 
			
		||||
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 | 
			
		||||
  typedef decltype(innerProductD(vobj(),vobj())) norm_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  Vector<norm_t> norm_tmp(sites);
 | 
			
		||||
  Vector<norm_t>  norm_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  auto norm_tmp_v = &norm_tmp[0];
 | 
			
		||||
  {
 | 
			
		||||
    autoView(left_v,left, AcceleratorRead);
 | 
			
		||||
    autoView(right_v,right,AcceleratorRead);
 | 
			
		||||
    accelerator_for( ss, sites, 1,{
 | 
			
		||||
	auto left_tmp = left_v[ss];
 | 
			
		||||
	inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
 | 
			
		||||
        norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
 | 
			
		||||
      });
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto left_tmp = left_v(ss);
 | 
			
		||||
      coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
 | 
			
		||||
      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
 | 
			
		||||
  tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
 | 
			
		||||
#else
 | 
			
		||||
  // CPU
 | 
			
		||||
  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
 | 
			
		||||
  typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  Vector<norm_t> norm_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
  auto norm_tmp_v = &norm_tmp[0];
 | 
			
		||||
 | 
			
		||||
  accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
      auto left_tmp = left_v(ss);
 | 
			
		||||
      inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
 | 
			
		||||
      norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
 | 
			
		||||
  });
 | 
			
		||||
  // Already promoted to double
 | 
			
		||||
  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
 | 
			
		||||
  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
 | 
			
		||||
  ip = tmp[0];
 | 
			
		||||
  nrm = real(tmp[1]);
 | 
			
		||||
@@ -335,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
  // Parallel over orthog direction
 | 
			
		||||
  auto Data_v=Data.View();
 | 
			
		||||
  autoView( Data_v, Data, CpuRead);
 | 
			
		||||
  thread_for( r,rd, {
 | 
			
		||||
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
@@ -413,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 | 
			
		||||
  int e2=    grid->_slice_block [orthogdim];
 | 
			
		||||
  int stride=grid->_slice_stride[orthogdim];
 | 
			
		||||
 | 
			
		||||
  auto lhv=lhs.View();
 | 
			
		||||
  auto rhv=rhs.View();
 | 
			
		||||
  autoView( lhv, lhs, CpuRead);
 | 
			
		||||
  autoView( rhv, rhs, CpuRead);
 | 
			
		||||
  thread_for( r,rd,{
 | 
			
		||||
 | 
			
		||||
    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
 | 
			
		||||
@@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 | 
			
		||||
 | 
			
		||||
    tensor_reduced at; at=av;
 | 
			
		||||
 | 
			
		||||
    auto Rv=R.View();
 | 
			
		||||
    auto Xv=X.View();
 | 
			
		||||
    auto Yv=Y.View();
 | 
			
		||||
    thread_for_collapse(2, n, e1, {
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
    autoView( Rv, R, CpuWrite);
 | 
			
		||||
    autoView( Xv, X, CpuRead);
 | 
			
		||||
    autoView( Yv, Y, CpuRead);
 | 
			
		||||
    thread_for2d( n, e1, b,e2, {
 | 
			
		||||
	int ss= so+n*stride+b;
 | 
			
		||||
	Rv[ss] = at*Xv[ss]+Yv[ss];
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
@@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
 | 
			
		||||
  int nblock=FullGrid->_slice_nblock[Orthog];
 | 
			
		||||
  int ostride=FullGrid->_ostride[Orthog];
 | 
			
		||||
 | 
			
		||||
  auto X_v=X.View();
 | 
			
		||||
  auto Y_v=Y.View();
 | 
			
		||||
  auto R_v=R.View();
 | 
			
		||||
  autoView( X_v, X, CpuRead);
 | 
			
		||||
  autoView( Y_v, Y, CpuRead);
 | 
			
		||||
  autoView( R_v, R, CpuWrite);
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
    Vector<vobj> s_x(Nblock);
 | 
			
		||||
@@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 | 
			
		||||
  //  int nl=1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  // thread_for2d_in_region
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
  int stride=FullGrid->_slice_stride[Orthog];
 | 
			
		||||
  int block =FullGrid->_slice_block [Orthog];
 | 
			
		||||
  int nblock=FullGrid->_slice_nblock[Orthog];
 | 
			
		||||
  int ostride=FullGrid->_ostride[Orthog];
 | 
			
		||||
  auto R_v = R.View();
 | 
			
		||||
  auto X_v = X.View();
 | 
			
		||||
  autoView( R_v, R, CpuWrite);
 | 
			
		||||
  autoView( X_v, X, CpuRead);
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<vobj> s_x(Nblock);
 | 
			
		||||
@@ -692,8 +693,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::vector_typeD vector_typeD;
 | 
			
		||||
 | 
			
		||||
  auto lhs_v=lhs.View();
 | 
			
		||||
  auto rhs_v=rhs.View();
 | 
			
		||||
  autoView( lhs_v, lhs, CpuRead);
 | 
			
		||||
  autoView( rhs_v, rhs, CpuRead);
 | 
			
		||||
  thread_region
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<vobj> Left(Nblock);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,13 @@
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
#define WARP_SIZE 32
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
extern hipDeviceProp_t *gpu_props;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
extern cudaDeviceProp *gpu_props;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define WARP_SIZE 32
 | 
			
		||||
__device__ unsigned int retirementCount = 0;
 | 
			
		||||
 | 
			
		||||
template <class Iterator>
 | 
			
		||||
@@ -19,7 +25,12 @@ template <class Iterator>
 | 
			
		||||
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
 | 
			
		||||
  
 | 
			
		||||
  int device;
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  cudaGetDevice(&device);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
  hipGetDevice(&device);
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
  Iterator warpSize            = gpu_props[device].warpSize;
 | 
			
		||||
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
 | 
			
		||||
@@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 | 
			
		||||
    sobj *smem = (sobj *)shmem_pointer;
 | 
			
		||||
    
 | 
			
		||||
    // wait until all outstanding memory instructions in this thread are finished
 | 
			
		||||
    __threadfence();
 | 
			
		||||
    acceleratorFence();
 | 
			
		||||
    
 | 
			
		||||
    if (tid==0) {
 | 
			
		||||
      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
 | 
			
		||||
@@ -156,8 +167,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    // each thread must read the correct value of amLast
 | 
			
		||||
    __syncthreads();
 | 
			
		||||
    
 | 
			
		||||
    acceleratorSynchroniseAll();
 | 
			
		||||
 | 
			
		||||
    if (amLast) {
 | 
			
		||||
      // reduce buffer[0], ..., buffer[gridDim.x-1]
 | 
			
		||||
      Iterator i = tid;
 | 
			
		||||
@@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 | 
			
		||||
  sobj *buffer_v = &buffer[0];
 | 
			
		||||
  
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
 | 
			
		||||
  cudaDeviceSynchronize();
 | 
			
		||||
  
 | 
			
		||||
  cudaError err = cudaGetLastError();
 | 
			
		||||
  if ( cudaSuccess != err ) {
 | 
			
		||||
    printf("Cuda error %s\n",cudaGetErrorString( err ));
 | 
			
		||||
    exit(0);
 | 
			
		||||
  }
 | 
			
		||||
  accelerator_barrier();
 | 
			
		||||
  auto result = buffer_v[0];
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -375,7 +375,7 @@ public:
 | 
			
		||||
    int osites = _grid->oSites();  // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
 | 
			
		||||
    int words  = sizeof(scalar_object) / sizeof(scalar_type);
 | 
			
		||||
 | 
			
		||||
    auto l_v = l.View();
 | 
			
		||||
    autoView(l_v, l, CpuWrite);
 | 
			
		||||
    thread_for( ss, osites, {
 | 
			
		||||
      ExtractBuffer<scalar_object> buf(Nsimd);
 | 
			
		||||
      for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
 | 
			
		||||
@@ -461,8 +461,8 @@ public:
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    {
 | 
			
		||||
      // Obtain one reseeded generator per thread
 | 
			
		||||
      int Nthread = GridThread::GetThreads();
 | 
			
		||||
      // Obtain one reseeded generator per thread      
 | 
			
		||||
      int Nthread = 32; // Hardwire a good level or parallelism
 | 
			
		||||
      std::vector<RngEngine> seeders(Nthread);
 | 
			
		||||
      for(int t=0;t<Nthread;t++){
 | 
			
		||||
	seeders[t] = Reseed(master_engine);
 | 
			
		||||
 
 | 
			
		||||
@@ -42,8 +42,8 @@ template<class vobj>
 | 
			
		||||
inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
 | 
			
		||||
{
 | 
			
		||||
  Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  autoView(ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView(lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
 | 
			
		||||
    coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
@@ -58,8 +58,8 @@ template<int Index,class vobj>
 | 
			
		||||
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
 | 
			
		||||
{
 | 
			
		||||
  Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  autoView( ret_v , ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
 | 
			
		||||
    coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
 
 | 
			
		||||
@@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// remove and insert a half checkerboard
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
 | 
			
		||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 | 
			
		||||
{
 | 
			
		||||
  half.Checkerboard() = cb;
 | 
			
		||||
 | 
			
		||||
  auto half_v = half.View();
 | 
			
		||||
  auto full_v = full.View();
 | 
			
		||||
  autoView( half_v, half, CpuWrite);
 | 
			
		||||
  autoView( full_v, full, CpuRead);
 | 
			
		||||
  thread_for(ss, full.Grid()->oSites(),{
 | 
			
		||||
    int cbos;
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
@@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
 | 
			
		||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 | 
			
		||||
{
 | 
			
		||||
  int cb = half.Checkerboard();
 | 
			
		||||
  auto half_v = half.View();
 | 
			
		||||
  auto full_v = full.View();
 | 
			
		||||
  autoView( half_v , half, CpuRead);
 | 
			
		||||
  autoView( full_v , full, CpuWrite);
 | 
			
		||||
  thread_for(ss,full.Grid()->oSites(),{
 | 
			
		||||
 | 
			
		||||
    Coordinate coor;
 | 
			
		||||
@@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
 | 
			
		||||
  out = in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
 | 
			
		||||
  ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
 | 
			
		||||
  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
 | 
			
		||||
}
 | 
			
		||||
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
 | 
			
		||||
  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
 | 
			
		||||
  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
 | 
			
		||||
}
 | 
			
		||||
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
 | 
			
		||||
  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
 | 
			
		||||
  ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
@@ -151,12 +152,11 @@ accelerator_inline void convertType(T & out, const T & in) {
 | 
			
		||||
 | 
			
		||||
template<typename T1,typename T2>
 | 
			
		||||
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
 | 
			
		||||
  auto out_v = out.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto in_v  = in.AcceleratorView(ViewRead);
 | 
			
		||||
 | 
			
		||||
  autoView( out_v , out,AcceleratorWrite);
 | 
			
		||||
  autoView( in_v  , in ,AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,out_v.size(),T1::Nsimd(),{
 | 
			
		||||
      convertType(out_v[ss],in_v(ss));
 | 
			
		||||
    });
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
 | 
			
		||||
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
 | 
			
		||||
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
 | 
			
		||||
{
 | 
			
		||||
  auto lhs_v = lhs.AcceleratorView(ViewRead);
 | 
			
		||||
  auto rhs_v = rhs.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( lhs_v , lhs, AcceleratorRead);
 | 
			
		||||
  autoView( rhs_v , rhs, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
 | 
			
		||||
  Lattice<iScalar<t_inner>> ret(lhs.Grid());
 | 
			
		||||
  auto ret_v = ret.AcceleratorView(ViewWrite);
 | 
			
		||||
 | 
			
		||||
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
  {
 | 
			
		||||
    autoView(ret_v, ret,AcceleratorWrite);
 | 
			
		||||
    accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
      convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -194,14 +195,13 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
  Lattice<iScalar<CComplex>> ip(coarse);
 | 
			
		||||
  Lattice<vobj>     fineDataRed = fineData;
 | 
			
		||||
 | 
			
		||||
  //  auto fineData_   = fineData.View();
 | 
			
		||||
  auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto ip_         = ip.AcceleratorView(ViewReadWrite);
 | 
			
		||||
  autoView( coarseData_ , coarseData, AcceleratorWrite);
 | 
			
		||||
  autoView( ip_         , ip,         AcceleratorWrite);
 | 
			
		||||
  for(int v=0;v<nbasis;v++) {
 | 
			
		||||
    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
 | 
			
		||||
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 | 
			
		||||
	convertType(coarseData_[sc](v),ip_[sc]);
 | 
			
		||||
      });
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // improve numerical stability of projection
 | 
			
		||||
    // |fine> = |fine> - <basis|fine> |basis>
 | 
			
		||||
@@ -210,68 +210,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj,class CComplex,int nbasis>
 | 
			
		||||
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
			 const             Lattice<vobj>   &fineData,
 | 
			
		||||
			 const std::vector<Lattice<vobj> > &Basis)
 | 
			
		||||
{
 | 
			
		||||
  typedef iVector<CComplex,nbasis > coarseSiteData;
 | 
			
		||||
  coarseSiteData elide;
 | 
			
		||||
  typedef decltype(coalescedRead(elide)) ScalarComplex;
 | 
			
		||||
  GridBase * fine  = fineData.Grid();
 | 
			
		||||
  GridBase * coarse= coarseData.Grid();
 | 
			
		||||
  int  _ndimension = coarse->_ndimension;
 | 
			
		||||
 | 
			
		||||
  // checks
 | 
			
		||||
  assert( nbasis == Basis.size() );
 | 
			
		||||
  subdivides(coarse,fine); 
 | 
			
		||||
  for(int i=0;i<nbasis;i++){
 | 
			
		||||
    conformable(Basis[i],fineData);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Coordinate block_r      (_ndimension);
 | 
			
		||||
  
 | 
			
		||||
  for(int d=0 ; d<_ndimension;d++){
 | 
			
		||||
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
 | 
			
		||||
    assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
  int blockVol = fine->oSites()/coarse->oSites();
 | 
			
		||||
 | 
			
		||||
  coarseData=Zero();
 | 
			
		||||
 | 
			
		||||
  auto fineData_   = fineData.View();
 | 
			
		||||
  auto coarseData_ = coarseData.View();
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // To make this lock free, loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
 | 
			
		||||
  // Otherwise do fine inner product per site, and make the update atomic
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
 | 
			
		||||
 | 
			
		||||
    auto sc=sci/nbasis;
 | 
			
		||||
    auto i=sci%nbasis;
 | 
			
		||||
    auto Basis_      = Basis[i].View();
 | 
			
		||||
 | 
			
		||||
    Coordinate coor_c(_ndimension);
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
 | 
			
		||||
 | 
			
		||||
    int sf;
 | 
			
		||||
    decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
 | 
			
		||||
 | 
			
		||||
    for(int sb=0;sb<blockVol;sb++){
 | 
			
		||||
 | 
			
		||||
      Coordinate coor_b(_ndimension);
 | 
			
		||||
      Coordinate coor_f(_ndimension);
 | 
			
		||||
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor_b,sb,block_r);
 | 
			
		||||
      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
 | 
			
		||||
      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
      
 | 
			
		||||
      reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
 | 
			
		||||
    }
 | 
			
		||||
    coalescedWrite(coarseData_[sc](i),reduce);
 | 
			
		||||
  });
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj,class vobj2,class CComplex>
 | 
			
		||||
  inline void blockZAXPY(Lattice<vobj> &fineZ,
 | 
			
		||||
@@ -298,10 +236,10 @@ template<class vobj,class vobj2,class CComplex>
 | 
			
		||||
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  auto fineZ_  = fineZ.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto fineX_  = fineX.AcceleratorView(ViewRead);
 | 
			
		||||
  auto fineY_  = fineY.AcceleratorView(ViewRead);
 | 
			
		||||
  auto coarseA_= coarseA.AcceleratorView(ViewRead);
 | 
			
		||||
  autoView( fineZ_  , fineZ, AcceleratorWrite);
 | 
			
		||||
  autoView( fineX_  , fineX, AcceleratorRead);
 | 
			
		||||
  autoView( fineY_  , fineY, AcceleratorRead);
 | 
			
		||||
  autoView( coarseA_, coarseA, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
 | 
			
		||||
 | 
			
		||||
@@ -314,7 +252,7 @@ template<class vobj,class vobj2,class CComplex>
 | 
			
		||||
      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
      // z = A x + y
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
      typename vobj2::tensor_reduced::scalar_object cA;
 | 
			
		||||
      typename vobj::scalar_object cAx;
 | 
			
		||||
#else
 | 
			
		||||
@@ -344,15 +282,16 @@ template<class vobj,class CComplex>
 | 
			
		||||
  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
 | 
			
		||||
  Lattice<dotp> coarse_inner(coarse);
 | 
			
		||||
 | 
			
		||||
  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
 | 
			
		||||
 | 
			
		||||
  // Precision promotion
 | 
			
		||||
  fine_inner = localInnerProductD(fineX,fineY);
 | 
			
		||||
  fine_inner = localInnerProductD<vobj>(fineX,fineY);
 | 
			
		||||
  blockSum(coarse_inner,fine_inner);
 | 
			
		||||
  accelerator_for(ss, coarse->oSites(), 1, {
 | 
			
		||||
  {
 | 
			
		||||
    autoView( CoarseInner_  , CoarseInner,AcceleratorWrite);
 | 
			
		||||
    autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss, coarse->oSites(), 1, {
 | 
			
		||||
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -370,14 +309,15 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
 | 
			
		||||
  Lattice<dotp> coarse_inner(coarse);
 | 
			
		||||
 | 
			
		||||
  // Precision promotion?
 | 
			
		||||
  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
 | 
			
		||||
 | 
			
		||||
  fine_inner = localInnerProduct(fineX,fineY);
 | 
			
		||||
  blockSum(coarse_inner,fine_inner);
 | 
			
		||||
  accelerator_for(ss, coarse->oSites(), 1, {
 | 
			
		||||
    CoarseInner_[ss] = coarse_inner_[ss];
 | 
			
		||||
  });
 | 
			
		||||
  {
 | 
			
		||||
    autoView( CoarseInner_  , CoarseInner, AcceleratorWrite);
 | 
			
		||||
    autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss, coarse->oSites(), 1, {
 | 
			
		||||
	CoarseInner_[ss] = coarse_inner_[ss];
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj,class CComplex>
 | 
			
		||||
@@ -408,8 +348,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
			
		||||
  }
 | 
			
		||||
  int blockVol = fine->oSites()/coarse->oSites();
 | 
			
		||||
 | 
			
		||||
  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
 | 
			
		||||
  auto fineData_   = fineData.AcceleratorView(ViewRead);
 | 
			
		||||
  // Turn this around to loop threaded over sc and interior loop 
 | 
			
		||||
  // over sf would thread better
 | 
			
		||||
  autoView( coarseData_ , coarseData, AcceleratorWrite);
 | 
			
		||||
  autoView( fineData_   , fineData, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  accelerator_for(sc,coarse->oSites(),1,{
 | 
			
		||||
 | 
			
		||||
@@ -510,8 +452,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
  for(int d=0 ; d<_ndimension;d++){
 | 
			
		||||
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
 | 
			
		||||
  }
 | 
			
		||||
  auto fineData_   = fineData.View();
 | 
			
		||||
  auto coarseData_ = coarseData.View();
 | 
			
		||||
  autoView( fineData_   , fineData, AcceleratorWrite);
 | 
			
		||||
  autoView( coarseData_ , coarseData, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
  // Loop with a cache friendly loop ordering
 | 
			
		||||
  accelerator_for(sf,fine->oSites(),1,{
 | 
			
		||||
@@ -524,7 +466,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<nbasis;i++) {
 | 
			
		||||
      auto basis_ = Basis[i].View();
 | 
			
		||||
      /*      auto basis_ = Basis[i],  );*/
 | 
			
		||||
      if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
 | 
			
		||||
      else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
 | 
			
		||||
    }
 | 
			
		||||
@@ -543,7 +485,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
  fineData=Zero();
 | 
			
		||||
  for(int i=0;i<nbasis;i++) {
 | 
			
		||||
    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
 | 
			
		||||
    auto  ip_ =  ip.AcceleratorView(ViewRead);
 | 
			
		||||
 | 
			
		||||
    //Lattice<CComplex> cip(coarse);
 | 
			
		||||
    //autoView( cip_ , cip, AcceleratorWrite);
 | 
			
		||||
    //autoView(  ip_ ,  ip, AcceleratorRead);
 | 
			
		||||
    //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
 | 
			
		||||
    //	coalescedWrite(cip_[sc], ip_(sc)());
 | 
			
		||||
    //  });
 | 
			
		||||
    //blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
 | 
			
		||||
    blockZAXPY(fineData,ip,Basis[i],fineData);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -571,15 +520,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 | 
			
		||||
    assert(ig->lSites() == og->lSites());
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  autoView(in_v,in,CpuRead);
 | 
			
		||||
  autoView(out_v,out,CpuWrite);
 | 
			
		||||
  thread_for(idx, ig->lSites(),{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    ssobj ss;
 | 
			
		||||
 | 
			
		||||
    Coordinate lcoor(ni);
 | 
			
		||||
    ig->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    peekLocalSite(s,in,lcoor);
 | 
			
		||||
    peekLocalSite(s,in_v,lcoor);
 | 
			
		||||
    ss=s;
 | 
			
		||||
    pokeLocalSite(ss,out,lcoor);
 | 
			
		||||
    pokeLocalSite(ss,out_v,lcoor);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -614,8 +565,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 | 
			
		||||
  Coordinate rdt = Tg->_rdimensions;
 | 
			
		||||
  Coordinate ist = Tg->_istride;
 | 
			
		||||
  Coordinate ost = Tg->_ostride;
 | 
			
		||||
  auto t_v = To.AcceleratorView(ViewWrite);
 | 
			
		||||
  auto f_v = From.AcceleratorView(ViewRead);
 | 
			
		||||
 | 
			
		||||
  autoView( t_v , To, AcceleratorWrite);
 | 
			
		||||
  autoView( f_v , From, AcceleratorRead);
 | 
			
		||||
  accelerator_for(idx,Fg->lSites(),1,{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    Coordinate Fcoor(nd);
 | 
			
		||||
@@ -638,8 +590,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 | 
			
		||||
      for(int w=0;w<words;w++){
 | 
			
		||||
	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd];  // FIXME IF RRII layout, type pun no worke
 | 
			
		||||
      }
 | 
			
		||||
      //      peekLocalSite(s,From,Fcoor);
 | 
			
		||||
      //      pokeLocalSite(s,To  ,Tcoor);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
@@ -670,6 +620,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  autoView(lowDimv,lowDim,CpuRead);
 | 
			
		||||
  autoView(higherDimv,higherDim,CpuWrite);
 | 
			
		||||
  thread_for(idx,lg->lSites(),{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    Coordinate lcoor(nl);
 | 
			
		||||
@@ -682,8 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
 | 
			
		||||
	hcoor[d]=lcoor[ddl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,lowDim,lcoor);
 | 
			
		||||
    pokeLocalSite(s,higherDim,hcoor);
 | 
			
		||||
    peekLocalSite(s,lowDimv,lcoor);
 | 
			
		||||
    pokeLocalSite(s,higherDimv,hcoor);
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -711,6 +663,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  autoView(lowDimv,lowDim,CpuWrite);
 | 
			
		||||
  autoView(higherDimv,higherDim,CpuRead);
 | 
			
		||||
  thread_for(idx,lg->lSites(),{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    Coordinate lcoor(nl);
 | 
			
		||||
@@ -723,8 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 | 
			
		||||
	hcoor[d]=lcoor[ddl++];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    peekLocalSite(s,higherDim,hcoor);
 | 
			
		||||
    pokeLocalSite(s,lowDim,lcoor);
 | 
			
		||||
    peekLocalSite(s,higherDimv,hcoor);
 | 
			
		||||
    pokeLocalSite(s,lowDimv,lcoor);
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
@@ -752,6 +706,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  autoView(lowDimv,lowDim,CpuRead);
 | 
			
		||||
  autoView(higherDimv,higherDim,CpuWrite);
 | 
			
		||||
  thread_for(idx,lg->lSites(),{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    Coordinate lcoor(nl);
 | 
			
		||||
@@ -760,8 +716,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
 | 
			
		||||
    if( lcoor[orthog] == slice_lo ) { 
 | 
			
		||||
      hcoor=lcoor;
 | 
			
		||||
      hcoor[orthog] = slice_hi;
 | 
			
		||||
      peekLocalSite(s,lowDim,lcoor);
 | 
			
		||||
      pokeLocalSite(s,higherDim,hcoor);
 | 
			
		||||
      peekLocalSite(s,lowDimv,lcoor);
 | 
			
		||||
      pokeLocalSite(s,higherDimv,hcoor);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
@@ -789,6 +745,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // the above should guarantee that the operations are local
 | 
			
		||||
  autoView(lowDimv,lowDim,CpuWrite);
 | 
			
		||||
  autoView(higherDimv,higherDim,CpuRead);
 | 
			
		||||
  thread_for(idx,lg->lSites(),{
 | 
			
		||||
    sobj s;
 | 
			
		||||
    Coordinate lcoor(nl);
 | 
			
		||||
@@ -797,8 +755,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
 | 
			
		||||
    if( lcoor[orthog] == slice_lo ) { 
 | 
			
		||||
      hcoor=lcoor;
 | 
			
		||||
      hcoor[orthog] = slice_hi;
 | 
			
		||||
      peekLocalSite(s,higherDim,hcoor);
 | 
			
		||||
      pokeLocalSite(s,lowDim,lcoor);
 | 
			
		||||
      peekLocalSite(s,higherDimv,hcoor);
 | 
			
		||||
      pokeLocalSite(s,lowDimv,lcoor);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
@@ -862,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //loop over outer index
 | 
			
		||||
  auto in_v  = in.View();
 | 
			
		||||
  autoView( in_v  , in, CpuRead);
 | 
			
		||||
  thread_for(in_oidx,in_grid->oSites(),{
 | 
			
		||||
    //Assemble vector of pointers to output elements
 | 
			
		||||
    ExtractPointerArray<sobj> out_ptrs(in_nsimd);
 | 
			
		||||
@@ -955,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
 | 
			
		||||
    icoor[lane].resize(ndim);
 | 
			
		||||
    grid->iCoorFromIindex(icoor[lane],lane);
 | 
			
		||||
  }
 | 
			
		||||
  auto out_v = out.View();
 | 
			
		||||
  autoView( out_v , out, CpuWrite);
 | 
			
		||||
  thread_for(oidx, grid->oSites(),{
 | 
			
		||||
    //Assemble vector of pointers to output elements
 | 
			
		||||
    ExtractPointerArray<sobj> ptrs(nsimd);
 | 
			
		||||
@@ -1058,7 +1016,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 | 
			
		||||
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
 | 
			
		||||
  unvectorizeToLexOrdArray(in_slex_conv, in);
 | 
			
		||||
    
 | 
			
		||||
  auto out_v = out.View();
 | 
			
		||||
  autoView( out_v , out, CpuWrite);
 | 
			
		||||
  thread_for(out_oidx,out_grid->oSites(),{
 | 
			
		||||
    Coordinate out_ocoor(ndim);
 | 
			
		||||
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
 | 
			
		||||
 
 | 
			
		||||
@@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
 | 
			
		||||
  Lattice<vobj> ret(lhs.Grid());
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
@@ -58,8 +58,8 @@ template<int Index,class vobj>
 | 
			
		||||
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
 | 
			
		||||
{
 | 
			
		||||
  Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
 | 
			
		||||
  auto ret_v = ret.View();
 | 
			
		||||
  auto lhs_v = lhs.View();
 | 
			
		||||
  autoView( ret_v, ret, AcceleratorWrite);
 | 
			
		||||
  autoView( lhs_v, lhs, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
 | 
			
		||||
  });
 | 
			
		||||
 
 | 
			
		||||
@@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
 | 
			
		||||
  Lattice<obj> ret_i(rhs_i.Grid());
 | 
			
		||||
  auto rhs = rhs_i.View();
 | 
			
		||||
  auto ret = ret_i.View();
 | 
			
		||||
  autoView( rhs, rhs_i, AcceleratorRead);
 | 
			
		||||
  autoView( ret, ret_i, AcceleratorWrite);
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  accelerator_for(ss,rhs.size(),1,{
 | 
			
		||||
      ret[ss]=pow(rhs[ss],y);
 | 
			
		||||
@@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
 | 
			
		||||
}
 | 
			
		||||
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
 | 
			
		||||
  Lattice<obj> ret_i(rhs_i.Grid());
 | 
			
		||||
  auto rhs = rhs_i.View();
 | 
			
		||||
  auto ret = ret_i.View();
 | 
			
		||||
  autoView( rhs , rhs_i, AcceleratorRead);
 | 
			
		||||
  autoView( ret , ret_i, AcceleratorWrite);
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret[ss],mod(rhs(ss),y));
 | 
			
		||||
@@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
 | 
			
		||||
 | 
			
		||||
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
 | 
			
		||||
  Lattice<obj> ret_i(rhs_i.Grid());
 | 
			
		||||
  auto ret = ret_i.View();
 | 
			
		||||
  auto rhs = rhs_i.View();
 | 
			
		||||
  autoView( ret , ret_i, AcceleratorWrite);
 | 
			
		||||
  autoView( rhs , rhs_i, AcceleratorRead);
 | 
			
		||||
  ret.Checkerboard() = rhs_i.Checkerboard();
 | 
			
		||||
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret[ss],div(rhs(ss),y));
 | 
			
		||||
@@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
 | 
			
		||||
 | 
			
		||||
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
 | 
			
		||||
  Lattice<obj> ret_i(rhs_i.Grid());
 | 
			
		||||
  auto rhs = rhs_i.View();
 | 
			
		||||
  auto ret = ret_i.View();
 | 
			
		||||
  autoView( rhs , rhs_i, AcceleratorRead);
 | 
			
		||||
  autoView( ret , ret_i, AcceleratorWrite);
 | 
			
		||||
  ret.Checkerboard() = rhs.Checkerboard();
 | 
			
		||||
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
 | 
			
		||||
    coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										168
									
								
								Grid/lattice/Lattice_view.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								Grid/lattice/Lattice_view.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,168 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Base class which can be used by traits to pick up behaviour
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
class LatticeBase {}; // empty tag base: is_lattice<T> and ViewMap<T> in this file detect lattice types via std::is_base_of on it
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Conformable checks; same instance of Grid required
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
 | 
			
		||||
{
 | 
			
		||||
  assert(lhs == rhs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Minimal base class containing only data valid to access from accelerator
 | 
			
		||||
// _odata will be a managed pointer in CUDA
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Force access to lattice through a view object.
 | 
			
		||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
 | 
			
		||||
// strict, since host code could in principle directly access the data through the lattice object.
 | 
			
		||||
// Need to decide programming model.
 | 
			
		||||
#define LATTICE_VIEW_STRICT
 | 
			
		||||
template<class vobj> class LatticeAccelerator : public LatticeBase
 | 
			
		||||
{
 | 
			
		||||
protected:
 | 
			
		||||
  //public:
 | 
			
		||||
  GridBase *_grid;
 | 
			
		||||
  int checkerboard;
 | 
			
		||||
  vobj     *_odata;    // A managed pointer
 | 
			
		||||
  uint64_t _odata_size;    
 | 
			
		||||
  ViewAdvise advise;
 | 
			
		||||
public:
 | 
			
		||||
  accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { }; 
 | 
			
		||||
  accelerator_inline uint64_t oSites(void) const { return _odata_size; };
 | 
			
		||||
  accelerator_inline int  Checkerboard(void) const { return checkerboard; };
 | 
			
		||||
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
 | 
			
		||||
  accelerator_inline ViewAdvise Advise(void) const { return advise; };
 | 
			
		||||
  accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
 | 
			
		||||
  accelerator_inline void Conformable(GridBase * &grid) const
 | 
			
		||||
  { 
 | 
			
		||||
    if (grid) conformable(grid, _grid);
 | 
			
		||||
    else      grid = _grid;
 | 
			
		||||
  };
 | 
			
		||||
  // Host only
 | 
			
		||||
  GridBase * getGrid(void) const { return _grid; };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// A View class which provides accessor to the data.
 | 
			
		||||
// This will be safe to call from accelerator_for and is trivially copy constructible
 | 
			
		||||
// The copy constructor for this will need to be used by device lambda functions
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class vobj> 
 | 
			
		||||
class LatticeView : public LatticeAccelerator<vobj>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  // Rvalue
 | 
			
		||||
  ViewMode mode;
 | 
			
		||||
  void * cpu_ptr;
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { 
 | 
			
		||||
    return coalescedRead(this->_odata[i]); 
 | 
			
		||||
  }
 | 
			
		||||
#else 
 | 
			
		||||
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
 | 
			
		||||
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
 | 
			
		||||
 | 
			
		||||
  accelerator_inline uint64_t begin(void) const { return 0;};
 | 
			
		||||
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
 | 
			
		||||
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
 | 
			
		||||
 | 
			
		||||
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
 | 
			
		||||
  LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
 | 
			
		||||
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
 | 
			
		||||
  {
 | 
			
		||||
    this->ViewOpen(mode);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Host functions
 | 
			
		||||
  void ViewOpen(ViewMode mode)
 | 
			
		||||
  { // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
 | 
			
		||||
    //    std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
 | 
			
		||||
    this->cpu_ptr = (void *)this->_odata;
 | 
			
		||||
    this->mode    = mode;
 | 
			
		||||
    this->_odata  =(vobj *)
 | 
			
		||||
      MemoryManager::ViewOpen(this->cpu_ptr,
 | 
			
		||||
				this->_odata_size*sizeof(vobj),
 | 
			
		||||
				mode,
 | 
			
		||||
				this->advise);    
 | 
			
		||||
  }
 | 
			
		||||
  void ViewClose(void)
 | 
			
		||||
  { // Inform the manager
 | 
			
		||||
    //    std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
 | 
			
		||||
    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
// Little autoscope assister
 | 
			
		||||
// Scope guard for a view: stores its own copy of the view and calls
// ViewClose() on that copy when the guard is destroyed.
template<class View>
class ViewCloser
{
  View v;  // Take a copy of view and call view close when I go out of scope automatically
 public:
  ViewCloser(View &_v) : v(_v) {}
  ~ViewCloser() { v.ViewClose(); }

  // A copied guard would issue a second ViewClose() for the same view,
  // so the guard is not copyable.
  ViewCloser(const ViewCloser &) = delete;
  ViewCloser &operator=(const ViewCloser &) = delete;
};

// autoView(l_v,l,mode)
//   Declares `l_v` as the result of l.View(mode) and a ViewCloser guard named
//   _autoView<l_v> next to it, so ViewClose() runs when the enclosing scope ends.
//   The expansion is two declarations: use it as a statement inside a braced
//   scope, never as the unbraced body of an if/for/while.
//   `l` is parenthesised so that expressions such as *ptr can be passed.
#define autoView(l_v,l,mode)				\
	  auto l_v = (l).View(mode);			\
	  ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Lattice expression types used by ET to assemble the AST
 | 
			
		||||
// 
 | 
			
		||||
// Need to be able to detect code paths according to the whether a lattice object or not
 | 
			
		||||
// so introduce some trait type things
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
class LatticeExpressionBase {};
 | 
			
		||||
 | 
			
		||||
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
 | 
			
		||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 | 
			
		||||
 | 
			
		||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
 | 
			
		||||
template<class T>                 struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
 | 
			
		||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename _T1>                           
 | 
			
		||||
class LatticeUnaryExpression : public  LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  Op op;
 | 
			
		||||
  T1 arg1;
 | 
			
		||||
  LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename _T1, typename _T2>              
 | 
			
		||||
class LatticeBinaryExpression : public LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  typedef typename ViewMap<_T2>::Type T2;
 | 
			
		||||
  Op op;
 | 
			
		||||
  T1 arg1;
 | 
			
		||||
  T2 arg2;
 | 
			
		||||
  LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename _T1, typename _T2, typename _T3> 
 | 
			
		||||
class LatticeTrinaryExpression : public LatticeExpressionBase 
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename ViewMap<_T1>::Type T1;
 | 
			
		||||
  typedef typename ViewMap<_T2>::Type T2;
 | 
			
		||||
  typedef typename ViewMap<_T3>::Type T3;
 | 
			
		||||
  Op op;
 | 
			
		||||
  T1 arg1;
 | 
			
		||||
  T2 arg2;
 | 
			
		||||
  T3 arg3;
 | 
			
		||||
  LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
 | 
			
		||||
};
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <sys/syscall.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef __x86_64__
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
accelerator_inline uint64_t __rdtsc(void) {  return 0; }
 | 
			
		||||
accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
 | 
			
		||||
#else
 | 
			
		||||
@@ -112,7 +112,6 @@ class PerformanceCounter {
 | 
			
		||||
private:
 | 
			
		||||
 | 
			
		||||
  typedef struct { 
 | 
			
		||||
  public:
 | 
			
		||||
    uint32_t type;
 | 
			
		||||
    uint64_t config;
 | 
			
		||||
    const char *name;
 | 
			
		||||
 
 | 
			
		||||
@@ -12773,7 +12773,7 @@ namespace pugi
 | 
			
		||||
#undef PUGI__THROW_ERROR
 | 
			
		||||
#undef PUGI__CHECK_ERROR
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#pragma pop
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -114,19 +114,22 @@ public:
 | 
			
		||||
      U = adj(Cshift(U, mu, -1));
 | 
			
		||||
      PokeIndex<LorentzIndex>(Uadj, U, mu);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
 | 
			
		||||
 | 
			
		||||
    autoView(Umu_v,Umu,CpuRead);
 | 
			
		||||
    autoView(Uadj_v,Uadj,CpuRead);
 | 
			
		||||
    autoView(Uds_v,Uds,CpuWrite);
 | 
			
		||||
    thread_for( lidx, GaugeGrid->lSites(), {
 | 
			
		||||
      Coordinate lcoor;
 | 
			
		||||
      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
 | 
			
		||||
      
 | 
			
		||||
      peekLocalSite(ScalarUmu, Umu, lcoor);
 | 
			
		||||
      peekLocalSite(ScalarUmu, Umu_v, lcoor);
 | 
			
		||||
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
 | 
			
		||||
      
 | 
			
		||||
      peekLocalSite(ScalarUmu, Uadj, lcoor);
 | 
			
		||||
      peekLocalSite(ScalarUmu, Uadj_v, lcoor);
 | 
			
		||||
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
 | 
			
		||||
      
 | 
			
		||||
      pokeLocalSite(ScalarUds, Uds, lcoor);
 | 
			
		||||
    }
 | 
			
		||||
      pokeLocalSite(ScalarUds, Uds_v, lcoor);
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
      
 | 
			
		||||
  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
 | 
			
		||||
 
 | 
			
		||||
@@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 | 
			
		||||
NAMESPACE_CHECK(Wilson5D);
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 | 
			
		||||
NAMESPACE_CHECK(Staggered);
 | 
			
		||||
@@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 | 
			
		||||
 | 
			
		||||
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 | 
			
		||||
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 | 
			
		||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 | 
			
		||||
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 | 
			
		||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
 | 
			
		||||
 
 | 
			
		||||
@@ -96,11 +96,11 @@ public:
 | 
			
		||||
    int sl        = St._simd_layout[direction];
 | 
			
		||||
    Coordinate icoor;
 | 
			
		||||
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
    _Spinor tmp;
 | 
			
		||||
 | 
			
		||||
    const int Nsimd =SiteDoubledGaugeField::Nsimd();
 | 
			
		||||
    int s = SIMTlane(Nsimd);
 | 
			
		||||
    int s = acceleratorSIMTlane(Nsimd);
 | 
			
		||||
    St.iCoorFromIindex(icoor,s);
 | 
			
		||||
 | 
			
		||||
    int mmu = mu % Nd;
 | 
			
		||||
@@ -232,15 +232,17 @@ public:
 | 
			
		||||
      if ( Params.twists[mu] ) { 
 | 
			
		||||
	Uconj = where(coor==neglink,-Uconj,Uconj);
 | 
			
		||||
      }
 | 
			
		||||
	  
 | 
			
		||||
      auto U_v = U.View();
 | 
			
		||||
      auto Uds_v = Uds.View();
 | 
			
		||||
      auto Uconj_v = Uconj.View();
 | 
			
		||||
      auto Utmp_v= Utmp.View();
 | 
			
		||||
      thread_foreach(ss,U_v,{
 | 
			
		||||
	Uds_v[ss](0)(mu) = U_v[ss]();
 | 
			
		||||
	Uds_v[ss](1)(mu) = Uconj_v[ss]();
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	autoView( U_v , U, CpuRead);
 | 
			
		||||
	autoView( Uconj_v , Uconj, CpuRead);
 | 
			
		||||
	autoView( Uds_v , Uds, CpuWrite);
 | 
			
		||||
	autoView( Utmp_v, Utmp, CpuWrite);
 | 
			
		||||
	thread_foreach(ss,U_v,{
 | 
			
		||||
	    Uds_v[ss](0)(mu) = U_v[ss]();
 | 
			
		||||
	    Uds_v[ss](1)(mu) = Uconj_v[ss]();
 | 
			
		||||
	  });
 | 
			
		||||
      }
 | 
			
		||||
          
 | 
			
		||||
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
 | 
			
		||||
      Uconj = adj(Cshift(Uconj,mu,-1));
 | 
			
		||||
@@ -250,19 +252,25 @@ public:
 | 
			
		||||
	Utmp = where(coor==0,Uconj,Utmp);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      thread_foreach(ss,Utmp_v,{
 | 
			
		||||
	Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
 | 
			
		||||
      });
 | 
			
		||||
          
 | 
			
		||||
      {
 | 
			
		||||
	autoView( Uds_v , Uds, CpuWrite);
 | 
			
		||||
	autoView( Utmp_v, Utmp, CpuWrite);
 | 
			
		||||
	thread_foreach(ss,Utmp_v,{
 | 
			
		||||
	    Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
 | 
			
		||||
	  });
 | 
			
		||||
      }
 | 
			
		||||
      Utmp = Uconj;
 | 
			
		||||
      if ( Params.twists[mu] ) { 
 | 
			
		||||
	Utmp = where(coor==0,U,Utmp);
 | 
			
		||||
      }
 | 
			
		||||
	  
 | 
			
		||||
      thread_foreach(ss,Utmp_v,{
 | 
			
		||||
        Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
 | 
			
		||||
      });
 | 
			
		||||
          
 | 
			
		||||
 | 
			
		||||
      {	  
 | 
			
		||||
	autoView( Uds_v , Uds, CpuWrite);
 | 
			
		||||
	autoView( Utmp_v, Utmp, CpuWrite);
 | 
			
		||||
	thread_foreach(ss,Utmp_v,{
 | 
			
		||||
	    Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
 | 
			
		||||
        });
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
      
 | 
			
		||||
@@ -272,11 +280,14 @@ public:
 | 
			
		||||
    GaugeLinkField link(mat.Grid());
 | 
			
		||||
    // use lorentz for flavour as hack.
 | 
			
		||||
    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
 | 
			
		||||
    auto link_v = link.View();
 | 
			
		||||
    auto tmp_v = tmp.View();
 | 
			
		||||
    thread_foreach(ss,tmp_v,{
 | 
			
		||||
      link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    {
 | 
			
		||||
      autoView( link_v , link, CpuWrite);
 | 
			
		||||
      autoView( tmp_v , tmp, CpuRead);
 | 
			
		||||
      thread_foreach(ss,tmp_v,{
 | 
			
		||||
	  link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
 | 
			
		||||
	});
 | 
			
		||||
    }
 | 
			
		||||
    PokeIndex<LorentzIndex>(mat, link, mu);
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
@@ -306,16 +317,18 @@ public:
 | 
			
		||||
        
 | 
			
		||||
    GaugeLinkField tmp(mat.Grid());
 | 
			
		||||
    tmp = Zero();
 | 
			
		||||
    auto tmp_v = tmp.View();
 | 
			
		||||
    auto Atilde_v = Atilde.View();
 | 
			
		||||
    auto Btilde_v = Btilde.View();
 | 
			
		||||
    thread_for(ss,tmp.Grid()->oSites(),{
 | 
			
		||||
      for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	int sF = s + Ls * ss;
 | 
			
		||||
	auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 | 
			
		||||
	tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
    {
 | 
			
		||||
      autoView( tmp_v , tmp, CpuWrite);
 | 
			
		||||
      autoView( Atilde_v , Atilde, CpuRead);
 | 
			
		||||
      autoView( Btilde_v , Btilde, CpuRead);
 | 
			
		||||
      thread_for(ss,tmp.Grid()->oSites(),{
 | 
			
		||||
	  for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	    int sF = s + Ls * ss;
 | 
			
		||||
	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
 | 
			
		||||
	    tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
 | 
			
		||||
	  }
 | 
			
		||||
	});
 | 
			
		||||
    }
 | 
			
		||||
    PokeIndex<LorentzIndex>(mat, tmp, mu);
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -61,8 +61,8 @@ public:
 | 
			
		||||
  double DhopCalls;
 | 
			
		||||
  double DhopCommTime;
 | 
			
		||||
  double DhopComputeTime;
 | 
			
		||||
      double DhopComputeTime2;
 | 
			
		||||
      double DhopFaceTime;
 | 
			
		||||
  double DhopComputeTime2;
 | 
			
		||||
  double DhopFaceTime;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Implement the abstract base
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										194
									
								
								Grid/qcd/action/fermion/NaiveStaggeredFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										194
									
								
								Grid/qcd/action/fermion/NaiveStaggeredFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,194 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi, Peter Boyle
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
			   /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
 | 
			
		||||
#define GRID_QCD_NAIVE_STAG_FERMION_H
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// Non-template base holding the static constants that NaiveStaggeredFermion<Impl>
// inherits.  The two vectors are only declared here, not defined.
class NaiveStaggeredFermionStatic
{
public:
  static const int npoint = 8;
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
 | 
			
		||||
public:
 | 
			
		||||
  INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
  typedef StaggeredKernels<Impl> Kernels;
 | 
			
		||||
 | 
			
		||||
  FermionField _tmp;
 | 
			
		||||
  FermionField &tmp(void) { return _tmp; }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  // Performance monitoring
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  void Report(void);
 | 
			
		||||
  void ZeroCounters(void);
 | 
			
		||||
  double DhopTotalTime;
 | 
			
		||||
  double DhopCalls;
 | 
			
		||||
  double DhopCommTime;
 | 
			
		||||
  double DhopComputeTime;
 | 
			
		||||
  double DhopComputeTime2;
 | 
			
		||||
  double DhopFaceTime;
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Implement the abstract base
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  GridBase *GaugeGrid(void) { return _grid; }
 | 
			
		||||
  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
 | 
			
		||||
  GridBase *FermionGrid(void) { return _grid; }
 | 
			
		||||
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////
 | 
			
		||||
  // override multiply; cut number routines if pass dagger argument
 | 
			
		||||
  // and also make interface more uniformly consistent
 | 
			
		||||
  //////////////////////////////////////////////////////////////////
 | 
			
		||||
  void M(const FermionField &in, FermionField &out);
 | 
			
		||||
  void Mdag(const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  // half checkerboard operations
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  void Meooe(const FermionField &in, FermionField &out);
 | 
			
		||||
  void MeooeDag(const FermionField &in, FermionField &out);
 | 
			
		||||
  void Mooee(const FermionField &in, FermionField &out);
 | 
			
		||||
  void MooeeDag(const FermionField &in, FermionField &out);
 | 
			
		||||
  void MooeeInv(const FermionField &in, FermionField &out);
 | 
			
		||||
  void MooeeInvDag(const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////
 | 
			
		||||
  // Derivative interface
 | 
			
		||||
  ////////////////////////
 | 
			
		||||
  // Interface calls an internal routine
 | 
			
		||||
  void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
  void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
  void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // non-hermitian hopping term; half cb or both
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void Dhop  (const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
  void DhopOE(const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
  void DhopEO(const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Multigrid assistance; force term uses too
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
 | 
			
		||||
  void MdirAll(const FermionField &in, std::vector<FermionField> &out);
 | 
			
		||||
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Extra methods added by derived
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void DerivInternal(StencilImpl &st, 
 | 
			
		||||
		     DoubledGaugeField &U,
 | 
			
		||||
		     GaugeField &mat, 
 | 
			
		||||
		     const FermionField &A, const FermionField &B, int dag);
 | 
			
		||||
 | 
			
		||||
  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
                    const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
			       const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 | 
			
		||||
				   const FermionField &in, FermionField &out, int dag);
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Grid own interface Constructor
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
 | 
			
		||||
			GridRedBlackCartesian &Hgrid, RealD _mass,
 | 
			
		||||
			RealD _c1, RealD _u0,
 | 
			
		||||
			const ImplParams &p = ImplParams());
 | 
			
		||||
  NaiveStaggeredFermion(GridCartesian &Fgrid,
 | 
			
		||||
			GridRedBlackCartesian &Hgrid, RealD _mass,
 | 
			
		||||
			RealD _c1, RealD _u0,
 | 
			
		||||
			const ImplParams &p = ImplParams());
 | 
			
		||||
 | 
			
		||||
  // DoubleStore impl dependent
 | 
			
		||||
  void ImportGauge      (const GaugeField &_U );
 | 
			
		||||
  DoubledGaugeField &GetU(void)   { return Umu ; } ;
 | 
			
		||||
  void CopyGaugeCheckerboards(void);
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Data members require to support the functionality
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  //    protected:
 | 
			
		||||
public:
 | 
			
		||||
  // any other parameters of action ???
 | 
			
		||||
  virtual int   isTrivialEE(void) { return 1; };
 | 
			
		||||
  virtual RealD Mass(void) { return mass; }
 | 
			
		||||
  RealD mass;
 | 
			
		||||
  RealD u0;
 | 
			
		||||
  RealD c1;
 | 
			
		||||
 | 
			
		||||
  GridBase *_grid;
 | 
			
		||||
  GridBase *_cbgrid;
 | 
			
		||||
 | 
			
		||||
  // Defines the stencils for even and odd
 | 
			
		||||
  StencilImpl Stencil;
 | 
			
		||||
  StencilImpl StencilEven;
 | 
			
		||||
  StencilImpl StencilOdd;
 | 
			
		||||
 | 
			
		||||
  // Copy of the gauge field , with even and odd subsets
 | 
			
		||||
  DoubledGaugeField Umu;
 | 
			
		||||
  DoubledGaugeField UmuEven;
 | 
			
		||||
  DoubledGaugeField UmuOdd;
 | 
			
		||||
 | 
			
		||||
  LebesgueOrder Lebesgue;
 | 
			
		||||
  LebesgueOrder LebesgueEvenOdd;
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Conserved current utilities
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                PropagatorField &q_in_2,
 | 
			
		||||
                                PropagatorField &q_out,
 | 
			
		||||
                                PropagatorField &src,
 | 
			
		||||
                                Current curr_type,
 | 
			
		||||
                                unsigned int mu);
 | 
			
		||||
  void SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                           PropagatorField &q_out,
 | 
			
		||||
                           PropagatorField &srct,
 | 
			
		||||
                           Current curr_type,
 | 
			
		||||
                           unsigned int mu, 
 | 
			
		||||
                           unsigned int tmin,
 | 
			
		||||
                           unsigned int tmax,
 | 
			
		||||
			   ComplexField &lattice_cmplx);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Shorthand names for the NaiveStaggeredFermion instantiations over
// StaggeredImplF and StaggeredImplD.
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 | 
			
		||||
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
 | 
			
		||||
  INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
  typedef FermionOperator<Impl> Base;
 | 
			
		||||
   
 | 
			
		||||
public:
 | 
			
		||||
    
 | 
			
		||||
   void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 | 
			
		||||
		      int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
  void DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 | 
			
		||||
		    const FermionField &in, FermionField &out, int dag, int interior,int exterior);
 | 
			
		||||
  void DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
		 DoubledGaugeField &U,
 | 
			
		||||
		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
 | 
			
		||||
  
 | 
			
		||||
  void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 | 
			
		||||
		     int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
 | 
			
		||||
 protected:    
 | 
			
		||||
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   // Generic Nc kernels
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteGeneric(StencilView &st, 
 | 
			
		||||
			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 | 
			
		||||
			SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
			const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteGenericInt(StencilView &st, 
 | 
			
		||||
			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 | 
			
		||||
			   SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
			   const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteGenericExt(StencilView &st, 
 | 
			
		||||
			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
			   SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
			   const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
@@ -71,15 +82,18 @@ public:
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   // Nc=3 specific kernels
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteHand(StencilView &st, 
 | 
			
		||||
		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 | 
			
		||||
		     SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
		     const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteHandInt(StencilView &st, 
 | 
			
		||||
			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 | 
			
		||||
			SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
			const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   template<int Naik> accelerator_inline
 | 
			
		||||
   void DhopSiteHandExt(StencilView &st, 
 | 
			
		||||
			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 | 
			
		||||
			SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
			const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
@@ -87,27 +101,10 @@ public:
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   // Asm Nc=3 specific kernels
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
   void DhopSiteAsm(StencilView &st, 
 | 
			
		||||
		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 | 
			
		||||
		    SiteSpinor * buf, int LLs, int sU, 
 | 
			
		||||
		    const FermionFieldView &in, FermionFieldView &out,int dag);
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   // Generic interface; fan out to right routine
 | 
			
		||||
   ///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 | 
			
		||||
		 SiteSpinor * buf, int LLs, int sU,
 | 
			
		||||
		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
 | 
			
		||||
 | 
			
		||||
   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 | 
			
		||||
		    SiteSpinor * buf, int LLs, int sU,
 | 
			
		||||
		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
 | 
			
		||||
 | 
			
		||||
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 | 
			
		||||
		 SiteSpinor * buf, int LLs, int sU,
 | 
			
		||||
		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
 | 
			
		||||
  
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -113,20 +113,7 @@ public:
 | 
			
		||||
      
 | 
			
		||||
  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *GaugeGrid = U_ds.Grid();
 | 
			
		||||
    thread_for(lidx, GaugeGrid->lSites(),{
 | 
			
		||||
 | 
			
		||||
	SiteScalarGaugeLink   ScalarU;
 | 
			
		||||
	SiteDoubledGaugeField ScalarUds;
 | 
			
		||||
	
 | 
			
		||||
	Coordinate lcoor;
 | 
			
		||||
	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
 | 
			
		||||
	peekLocalSite(ScalarUds, U_ds, lcoor);
 | 
			
		||||
	
 | 
			
		||||
	peekLocalSite(ScalarU, U, lcoor);
 | 
			
		||||
	ScalarUds(mu) = ScalarU();
 | 
			
		||||
	
 | 
			
		||||
    });
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
  inline void DoubleStore(GridBase *GaugeGrid,
 | 
			
		||||
			  DoubledGaugeField &UUUds, // for Naik term
 | 
			
		||||
 
 | 
			
		||||
@@ -257,15 +257,16 @@ private:
 | 
			
		||||
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
 | 
			
		||||
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
 | 
			
		||||
  // using the DeGrand-Rossi basis for the gamma matrices
 | 
			
		||||
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F.Grid());
 | 
			
		||||
    T = Zero();
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView(T_v,T,AcceleratorWrite);
 | 
			
		||||
    autoView(F_v,F,AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
 | 
			
		||||
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
 | 
			
		||||
@@ -281,9 +282,9 @@ private:
 | 
			
		||||
    CloverFieldType T(F.Grid());
 | 
			
		||||
    T = Zero();
 | 
			
		||||
    
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView(T_v, T,AcceleratorWrite);
 | 
			
		||||
    autoView(F_v, F,AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 1) = -F_v[i]()();
 | 
			
		||||
      T_v[i]()(1, 0) = F_v[i]()();
 | 
			
		||||
@@ -299,9 +300,9 @@ private:
 | 
			
		||||
    CloverFieldType T(F.Grid());
 | 
			
		||||
    T = Zero();
 | 
			
		||||
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView(T_v,T,AcceleratorWrite);
 | 
			
		||||
    autoView(F_v,F,AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
 | 
			
		||||
      T_v[i]()(1, 1) = timesI(F_v[i]()());
 | 
			
		||||
@@ -317,9 +318,9 @@ private:
 | 
			
		||||
    CloverFieldType T(F.Grid());
 | 
			
		||||
    T = Zero();
 | 
			
		||||
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView( T_v , T, AcceleratorWrite);
 | 
			
		||||
    autoView( F_v , F, AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 1) = timesI(F_v[i]()());
 | 
			
		||||
      T_v[i]()(1, 0) = timesI(F_v[i]()());
 | 
			
		||||
@@ -335,9 +336,9 @@ private:
 | 
			
		||||
    CloverFieldType T(F.Grid());
 | 
			
		||||
    T = Zero();
 | 
			
		||||
    
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView( T_v ,T,AcceleratorWrite);
 | 
			
		||||
    autoView( F_v ,F,AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 1) = -(F_v[i]()());
 | 
			
		||||
      T_v[i]()(1, 0) = (F_v[i]()());
 | 
			
		||||
@@ -354,9 +355,9 @@ private:
 | 
			
		||||
 | 
			
		||||
    T = Zero();
 | 
			
		||||
 | 
			
		||||
    auto T_v = T.View();
 | 
			
		||||
    auto F_v = F.View();
 | 
			
		||||
    thread_for(i, CloverTerm.Grid()->oSites(),
 | 
			
		||||
    autoView( T_v , T,AcceleratorWrite);
 | 
			
		||||
    autoView( F_v , F,AcceleratorRead);
 | 
			
		||||
    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      T_v[i]()(0, 0) = timesI(F_v[i]()());
 | 
			
		||||
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
 | 
			
		||||
 
 | 
			
		||||
@@ -106,10 +106,10 @@ public:
 | 
			
		||||
			    const _SpinorField & phi,
 | 
			
		||||
			    int mu)
 | 
			
		||||
  {
 | 
			
		||||
    auto out_v= out.View();
 | 
			
		||||
    auto phi_v= phi.View();
 | 
			
		||||
    auto Umu_v= Umu.View();
 | 
			
		||||
    thread_for(sss,out.Grid()->oSites(),{
 | 
			
		||||
    autoView( out_v, out, AcceleratorWrite);
 | 
			
		||||
    autoView( phi_v, phi, AcceleratorRead);
 | 
			
		||||
    autoView( Umu_v, Umu, AcceleratorRead);
 | 
			
		||||
    accelerator_for(sss,out.Grid()->oSites(),1,{
 | 
			
		||||
	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
@@ -191,18 +191,19 @@ public:
 | 
			
		||||
    int Ls=Btilde.Grid()->_fdimensions[0];
 | 
			
		||||
    GaugeLinkField tmp(mat.Grid());
 | 
			
		||||
    tmp = Zero();
 | 
			
		||||
    auto tmp_v = tmp.View();
 | 
			
		||||
    auto Btilde_v = Btilde.View();
 | 
			
		||||
    auto Atilde_v = Atilde.View();
 | 
			
		||||
    thread_for(sss,tmp.Grid()->oSites(),{
 | 
			
		||||
      int sU=sss;
 | 
			
		||||
      for(int s=0;s<Ls;s++){
 | 
			
		||||
	int sF = s+Ls*sU;
 | 
			
		||||
	tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
    {
 | 
			
		||||
      autoView( tmp_v , tmp, AcceleratorWrite);
 | 
			
		||||
      autoView( Btilde_v , Btilde, AcceleratorRead);
 | 
			
		||||
      autoView( Atilde_v , Atilde, AcceleratorRead);
 | 
			
		||||
      accelerator_for(sss,tmp.Grid()->oSites(),1,{
 | 
			
		||||
	  int sU=sss;
 | 
			
		||||
	  for(int s=0;s<Ls;s++){
 | 
			
		||||
	    int sF = s+Ls*sU;
 | 
			
		||||
	    tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
 | 
			
		||||
	  }
 | 
			
		||||
	});
 | 
			
		||||
    }
 | 
			
		||||
    PokeIndex<LorentzIndex>(mat,tmp,mu);
 | 
			
		||||
      
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 | 
			
		||||
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
    RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
@@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 | 
			
		||||
						      Current curr_type,
 | 
			
		||||
						      unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
 | 
			
		||||
  Gamma::Algebra Gmu [] = {
 | 
			
		||||
    Gamma::Algebra::GammaX,
 | 
			
		||||
    Gamma::Algebra::GammaY,
 | 
			
		||||
@@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
 | 
			
		||||
  int tshift = (mu == Nd-1) ? 1 : 0;
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // GENERAL CAYLEY CASE
 | 
			
		||||
 
 | 
			
		||||
@@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
 | 
			
		||||
  
 | 
			
		||||
  chi_i.Checkerboard()=psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i,AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i,AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i,AcceleratorWrite);
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
 | 
			
		||||
  auto pdiag = &diag[0];
 | 
			
		||||
@@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 | 
			
		||||
{
 | 
			
		||||
  chi_i.Checkerboard()=psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i,AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i,AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i,AcceleratorWrite);
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
 | 
			
		||||
  auto pdiag = &diag[0];
 | 
			
		||||
@@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
 | 
			
		||||
  chi_i.Checkerboard()=psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i,AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i,AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
 | 
			
		||||
@@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
  int Ls=this->Ls;
 | 
			
		||||
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i,AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i,AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  auto plee  = & lee [0];
 | 
			
		||||
  auto pdee  = & dee [0];
 | 
			
		||||
 
 | 
			
		||||
@@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
 | 
			
		||||
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
 | 
			
		||||
  chi_i.Checkerboard()=psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi, psi_i,CpuRead);
 | 
			
		||||
  autoView(phi, phi_i,CpuRead);
 | 
			
		||||
  autoView(chi, chi_i,CpuWrite);
 | 
			
		||||
  int Ls   = this->Ls;
 | 
			
		||||
  int LLs  = grid->_rdimensions[0];
 | 
			
		||||
  const int nsimd= Simd::Nsimd();
 | 
			
		||||
@@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
 | 
			
		||||
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
 | 
			
		||||
  chi_i.Checkerboard()=psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid=psi_i.Grid();
 | 
			
		||||
  auto psi=psi_i.View();
 | 
			
		||||
  auto phi=phi_i.View();
 | 
			
		||||
  auto chi=chi_i.View();
 | 
			
		||||
  autoView(psi,psi_i,CpuRead);
 | 
			
		||||
  autoView(phi,phi_i,CpuRead);
 | 
			
		||||
  autoView(chi,chi_i,CpuWrite);
 | 
			
		||||
  int Ls   = this->Ls;
 | 
			
		||||
  int LLs  = grid->_rdimensions[0];
 | 
			
		||||
  int nsimd= Simd::Nsimd();
 | 
			
		||||
@@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
 | 
			
		||||
					Vector<iSinglet<Simd> > &Matm)
 | 
			
		||||
{
 | 
			
		||||
  EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i,CpuRead);
 | 
			
		||||
  autoView(chi , chi_i,CpuWrite);
 | 
			
		||||
#ifndef AVX512
 | 
			
		||||
  {
 | 
			
		||||
    SiteHalfSpinor BcastP;
 | 
			
		||||
@@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
 | 
			
		||||
  EnableIf<Impl::LsVectorised,int> sfinae=0;
 | 
			
		||||
#ifndef AVX512
 | 
			
		||||
  {
 | 
			
		||||
    auto psi = psi_i.View();
 | 
			
		||||
    auto chi = chi_i.View();
 | 
			
		||||
    autoView(psi , psi_i,CpuRead);
 | 
			
		||||
    autoView(chi , chi_i,CpuWrite);
 | 
			
		||||
 | 
			
		||||
    SiteHalfSpinor BcastP;
 | 
			
		||||
    SiteHalfSpinor BcastM;
 | 
			
		||||
@@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  {
 | 
			
		||||
    auto psi = psi_i.View();
 | 
			
		||||
    auto chi = chi_i.View();
 | 
			
		||||
    autoView(psi , psi_i,CpuRead);
 | 
			
		||||
    autoView(chi , chi_i,CpuWrite);
 | 
			
		||||
    // pointers
 | 
			
		||||
    //  MASK_REGS;
 | 
			
		||||
#define Chi_00 %zmm0
 | 
			
		||||
 
 | 
			
		||||
@@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  GridBase* grid = psi_i.Grid();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView( phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView( psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView( chi , chi_i, AcceleratorWrite);
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
  auto pdiag = &diag[0];
 | 
			
		||||
  auto pupper = &upper[0];
 | 
			
		||||
@@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
 | 
			
		||||
  GridBase* grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView( psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView( phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView( chi , chi_i, AcceleratorWrite);
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
  auto pdiag = &diag[0];
 | 
			
		||||
  auto pupper = &upper[0];
 | 
			
		||||
@@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
 | 
			
		||||
{
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase* grid = psi_i.Grid();
 | 
			
		||||
  auto psi=psi_i.View();
 | 
			
		||||
  auto chi=chi_i.View();
 | 
			
		||||
  autoView( psi, psi_i, AcceleratorRead);
 | 
			
		||||
  autoView( chi, chi_i, AcceleratorWrite);
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
  auto plee  = & this->lee[0];
 | 
			
		||||
@@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
 | 
			
		||||
{
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase* grid = psi_i.Grid();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView( psi, psi_i, AcceleratorRead);
 | 
			
		||||
  autoView( chi, chi_i, AcceleratorWrite);
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
  auto plee  = & this->lee[0];
 | 
			
		||||
 
 | 
			
		||||
@@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
 | 
			
		||||
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  Stencil.HaloExchange(in,compressor);
 | 
			
		||||
  auto Umu_v   = Umu.View();
 | 
			
		||||
  auto UUUmu_v = UUUmu.View();
 | 
			
		||||
  auto in_v    = in.View();
 | 
			
		||||
  auto out_v   = out.View();
 | 
			
		||||
  autoView( Umu_v   ,   Umu, CpuRead);
 | 
			
		||||
  autoView( UUUmu_v , UUUmu, CpuRead);
 | 
			
		||||
  autoView( in_v    ,  in, CpuRead);
 | 
			
		||||
  autoView( out_v   , out, CpuWrite);
 | 
			
		||||
  thread_for( ss,Umu.Grid()->oSites(),{
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
      int sU=ss;
 | 
			
		||||
@@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
						    DoubledGaugeField & U,DoubledGaugeField & UUU,
 | 
			
		||||
						    const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
 | 
			
		||||
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
 | 
			
		||||
  else
 | 
			
		||||
#endif
 | 
			
		||||
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 | 
			
		||||
								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 | 
			
		||||
								   const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
 | 
			
		||||
  Compressor compressor; 
 | 
			
		||||
 | 
			
		||||
  int LLs = in.Grid()->_rdimensions[0];
 | 
			
		||||
@@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.Prepare();
 | 
			
		||||
  st.HaloGather(in,compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  DhopCommTime -=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > requests;
 | 
			
		||||
  st.CommunicateBegin(requests);
 | 
			
		||||
 | 
			
		||||
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  double ctime=0;
 | 
			
		||||
  double ptime=0;
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Ugly explicit thread mapping introduced for OPA reasons.
 | 
			
		||||
  // Remove explicit thread mapping introduced for OPA reasons.
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
 | 
			
		||||
  DhopComputeTime-=usecond();
 | 
			
		||||
  {
 | 
			
		||||
    int tid = omp_get_thread_num();
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
    int ncomms = CartesianCommunicator::nCommThreads;
 | 
			
		||||
    if (ncomms == -1) ncomms = 1;
 | 
			
		||||
    assert(nthreads > ncomms);
 | 
			
		||||
    if (tid >= ncomms) {
 | 
			
		||||
      double start = usecond();
 | 
			
		||||
      nthreads -= ncomms;
 | 
			
		||||
      int ttid  = tid - ncomms;
 | 
			
		||||
      int n     = U.Grid()->oSites(); // 4d vol
 | 
			
		||||
      int chunk = n / nthreads;
 | 
			
		||||
      int rem   = n % nthreads;
 | 
			
		||||
      int myblock, myn;
 | 
			
		||||
      if (ttid < rem) {
 | 
			
		||||
        myblock = ttid * chunk + ttid;
 | 
			
		||||
        myn = chunk+1;
 | 
			
		||||
      } else {
 | 
			
		||||
        myblock = ttid*chunk + rem;
 | 
			
		||||
        myn = chunk;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // do the compute
 | 
			
		||||
      auto   U_v  =   U.View();
 | 
			
		||||
      auto UUU_v  = UUU.View();
 | 
			
		||||
      auto  in_v  =  in.View();
 | 
			
		||||
      auto out_v  = out.View();
 | 
			
		||||
 | 
			
		||||
      if (dag == DaggerYes) {
 | 
			
		||||
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
          int sU = ss;
 | 
			
		||||
	  // Interior = 1; Exterior = 0; must implement for staggered
 | 
			
		||||
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
 | 
			
		||||
        }
 | 
			
		||||
      } else {
 | 
			
		||||
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
	  // Interior = 1; Exterior = 0;
 | 
			
		||||
          int sU = ss;
 | 
			
		||||
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
        ptime = usecond() - start;
 | 
			
		||||
    } else {
 | 
			
		||||
      double start = usecond();
 | 
			
		||||
      st.CommunicateThreaded();
 | 
			
		||||
      ctime = usecond() - start;
 | 
			
		||||
    }
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=0;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopCommTime += ctime;
 | 
			
		||||
  DhopComputeTime+=ptime;
 | 
			
		||||
 | 
			
		||||
  // First to enter, last to leave timing
 | 
			
		||||
  st.CollateThreads();
 | 
			
		||||
  DhopComputeTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  DhopComputeTime2-=usecond();
 | 
			
		||||
  st.CommunicateComplete(requests);
 | 
			
		||||
  DhopCommTime +=usecond();
 | 
			
		||||
 | 
			
		||||
  auto   U_v  =   U.View();
 | 
			
		||||
  auto UUU_v  = UUU.View();
 | 
			
		||||
  auto  in_v  =  in.View();
 | 
			
		||||
  auto out_v  = out.View();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    int sz=st.surface_list.size();
 | 
			
		||||
    thread_for( ss,sz,{
 | 
			
		||||
      int sU = st.surface_list[ss];
 | 
			
		||||
      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
 | 
			
		||||
    });
 | 
			
		||||
  } else {
 | 
			
		||||
    int sz=st.surface_list.size();
 | 
			
		||||
    thread_for( ss,sz,{
 | 
			
		||||
      int sU = st.surface_list[ss];
 | 
			
		||||
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
 | 
			
		||||
    });
 | 
			
		||||
  DhopComputeTime2-=usecond();
 | 
			
		||||
  {
 | 
			
		||||
    int interior=0;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime2+=usecond();
 | 
			
		||||
#else
 | 
			
		||||
  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  int LLs = in.Grid()->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 //double t1=usecond();
 | 
			
		||||
  DhopTotalTime -= usecond();
 | 
			
		||||
  DhopCommTime -= usecond();
 | 
			
		||||
@@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 | 
			
		||||
  
 | 
			
		||||
  DhopComputeTime -= usecond();
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  auto   U_v  =   U.View();
 | 
			
		||||
  auto UUU_v  = UUU.View();
 | 
			
		||||
  auto  in_v  =  in.View();
 | 
			
		||||
  auto out_v  = out.View();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    thread_for( ss,U.Grid()->oSites(),{
 | 
			
		||||
      int sU=ss;
 | 
			
		||||
      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
 | 
			
		||||
    });
 | 
			
		||||
  } else {
 | 
			
		||||
    thread_for( ss,U.Grid()->oSites(),{
 | 
			
		||||
      int sU=ss;
 | 
			
		||||
      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
 | 
			
		||||
    });
 | 
			
		||||
  {
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime += usecond();
 | 
			
		||||
  DhopTotalTime   += usecond();
 | 
			
		||||
 //double t2=usecond();
 | 
			
		||||
 //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl;
 | 
			
		||||
 //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl;
 | 
			
		||||
 //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl;
 | 
			
		||||
 //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
/*CHANGE END*/
 | 
			
		||||
 
 | 
			
		||||
@@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 | 
			
		||||
    ////////////////////////
 | 
			
		||||
    // Call the single hop
 | 
			
		||||
    ////////////////////////
 | 
			
		||||
    auto U_v   = U.View();
 | 
			
		||||
    auto UUU_v = UUU.View();
 | 
			
		||||
    auto B_v   = B.View();
 | 
			
		||||
    auto Btilde_v   = Btilde.View();
 | 
			
		||||
    autoView( U_v   , U, CpuRead);
 | 
			
		||||
    autoView( UUU_v , UUU, CpuRead);
 | 
			
		||||
    autoView( B_v      , B, CpuWrite);
 | 
			
		||||
    autoView( Btilde_v , Btilde, CpuWrite);
 | 
			
		||||
    thread_for(sss,B.Grid()->oSites(),{
 | 
			
		||||
      Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
 | 
			
		||||
    });
 | 
			
		||||
@@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
 | 
			
		||||
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  Stencil.HaloExchange(in, compressor);
 | 
			
		||||
  auto Umu_v   =   Umu.View();
 | 
			
		||||
  auto UUUmu_v = UUUmu.View();
 | 
			
		||||
  auto in_v    =  in.View();
 | 
			
		||||
  auto out_v   = out.View();
 | 
			
		||||
  autoView( Umu_v   ,   Umu, CpuRead);
 | 
			
		||||
  autoView( UUUmu_v , UUUmu, CpuRead);
 | 
			
		||||
  autoView( in_v    ,  in, CpuRead);
 | 
			
		||||
  autoView( out_v   , out, CpuWrite);
 | 
			
		||||
  thread_for( sss, in.Grid()->oSites(),{
 | 
			
		||||
    Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
 | 
			
		||||
  });
 | 
			
		||||
@@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 | 
			
		||||
						  const FermionField &in,
 | 
			
		||||
						  FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
 | 
			
		||||
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
 | 
			
		||||
  else
 | 
			
		||||
#endif
 | 
			
		||||
    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
@@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 | 
			
		||||
								 const FermionField &in,
 | 
			
		||||
								 FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  Compressor compressor; 
 | 
			
		||||
  int len =  U.Grid()->oSites();
 | 
			
		||||
 | 
			
		||||
@@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 | 
			
		||||
  DhopFaceTime    -= usecond();
 | 
			
		||||
  st.Prepare();
 | 
			
		||||
  st.HaloGather(in,compressor);
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
  DhopFaceTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  DhopCommTime -=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > requests;
 | 
			
		||||
  st.CommunicateBegin(requests);
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
  DhopFaceTime+= usecond();
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Ugly explicit thread mapping introduced for OPA reasons.
 | 
			
		||||
  // Removed explicit thread comms
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  DhopComputeTime    -= usecond();
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
    int tid = omp_get_thread_num();
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
    int ncomms = CartesianCommunicator::nCommThreads;
 | 
			
		||||
    if (ncomms == -1) ncomms = 1;
 | 
			
		||||
    assert(nthreads > ncomms);
 | 
			
		||||
 | 
			
		||||
    if (tid >= ncomms) {
 | 
			
		||||
      nthreads -= ncomms;
 | 
			
		||||
      int ttid  = tid - ncomms;
 | 
			
		||||
      int n     = len;
 | 
			
		||||
      int chunk = n / nthreads;
 | 
			
		||||
      int rem   = n % nthreads;
 | 
			
		||||
      int myblock, myn;
 | 
			
		||||
      if (ttid < rem) {
 | 
			
		||||
        myblock = ttid * chunk + ttid;
 | 
			
		||||
        myn = chunk+1;
 | 
			
		||||
      } else {
 | 
			
		||||
        myblock = ttid*chunk + rem;
 | 
			
		||||
        myn = chunk;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // do the compute
 | 
			
		||||
      auto U_v   = U.View();
 | 
			
		||||
      auto UUU_v = UUU.View();
 | 
			
		||||
      auto in_v  = in.View();
 | 
			
		||||
      auto out_v = out.View();
 | 
			
		||||
      if (dag == DaggerYes) {
 | 
			
		||||
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
          int sU = ss;
 | 
			
		||||
	  // Interior = 1; Exterior = 0; must implement for staggered
 | 
			
		||||
          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
 | 
			
		||||
        }
 | 
			
		||||
      } else {
 | 
			
		||||
        for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
	  // Interior = 1; Exterior = 0;
 | 
			
		||||
          int sU = ss;
 | 
			
		||||
          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      st.CommunicateThreaded();
 | 
			
		||||
    }
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=0;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  st.CommunicateComplete(requests);
 | 
			
		||||
  DhopCommTime +=usecond();
 | 
			
		||||
 | 
			
		||||
  // First to enter, last to leave timing
 | 
			
		||||
  DhopFaceTime    -= usecond();
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
@@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
 | 
			
		||||
 | 
			
		||||
  DhopComputeTime2    -= usecond();
 | 
			
		||||
  {
 | 
			
		||||
    auto U_v   = U.View();
 | 
			
		||||
    auto UUU_v = UUU.View();
 | 
			
		||||
    auto in_v  = in.View();
 | 
			
		||||
    auto out_v = out.View();
 | 
			
		||||
    if (dag == DaggerYes) {
 | 
			
		||||
      int sz=st.surface_list.size();
 | 
			
		||||
      thread_for(ss,sz,{
 | 
			
		||||
	int sU = st.surface_list[ss];
 | 
			
		||||
	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
 | 
			
		||||
      });
 | 
			
		||||
    } else {
 | 
			
		||||
      int sz=st.surface_list.size();
 | 
			
		||||
      thread_for(ss,sz,{
 | 
			
		||||
	int sU = st.surface_list[ss];
 | 
			
		||||
	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
 | 
			
		||||
      });
 | 
			
		||||
    }
 | 
			
		||||
    int interior=0;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime2    += usecond();
 | 
			
		||||
#else
 | 
			
		||||
  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 | 
			
		||||
  st.HaloExchange(in, compressor);
 | 
			
		||||
  DhopCommTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  auto U_v   =   U.View();
 | 
			
		||||
  auto UUU_v = UUU.View();
 | 
			
		||||
  auto in_v  =  in.View();
 | 
			
		||||
  auto out_v = out.View();
 | 
			
		||||
  DhopComputeTime -= usecond();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    thread_for(sss, in.Grid()->oSites(),{
 | 
			
		||||
      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
 | 
			
		||||
    });
 | 
			
		||||
  } else {
 | 
			
		||||
    thread_for(sss, in.Grid()->oSites(),{
 | 
			
		||||
      Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
 | 
			
		||||
    });
 | 
			
		||||
  {
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime += usecond();
 | 
			
		||||
  DhopTotalTime   += usecond();
 | 
			
		||||
 
 | 
			
		||||
@@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
 | 
			
		||||
@@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  auto pm  = this->pm;
 | 
			
		||||
  int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
 | 
			
		||||
@@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
 | 
			
		||||
@@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto phi = phi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(phi , phi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  assert(phi.Checkerboard() == psi.Checkerboard());
 | 
			
		||||
 | 
			
		||||
@@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  auto plee = & this->lee [0];
 | 
			
		||||
  auto pdee = & this->dee [0];
 | 
			
		||||
@@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  auto pm = this->pm;
 | 
			
		||||
  auto plee = & this->lee [0];
 | 
			
		||||
@@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
  auto plee = & this->lee [0];
 | 
			
		||||
  auto pdee = & this->dee [0];
 | 
			
		||||
@@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
 | 
			
		||||
{
 | 
			
		||||
  chi_i.Checkerboard() = psi_i.Checkerboard();
 | 
			
		||||
  GridBase *grid = psi_i.Grid();
 | 
			
		||||
  auto psi = psi_i.View();
 | 
			
		||||
  auto chi = chi_i.View();
 | 
			
		||||
  autoView(psi , psi_i, AcceleratorRead);
 | 
			
		||||
  autoView(chi , chi_i, AcceleratorWrite);
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
  auto pm = this->pm;
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,499 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi, Peter Boyle
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
#pragma once 
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
// Constructor and gauge import
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
 | 
			
		||||
						   RealD _mass,
 | 
			
		||||
						   RealD _c1, RealD _u0,
 | 
			
		||||
						   const ImplParams &p)
 | 
			
		||||
  : Kernels(p),
 | 
			
		||||
    _grid(&Fgrid),
 | 
			
		||||
    _cbgrid(&Hgrid),
 | 
			
		||||
    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
 | 
			
		||||
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even
 | 
			
		||||
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd
 | 
			
		||||
    mass(_mass),
 | 
			
		||||
    Lebesgue(_grid),
 | 
			
		||||
    LebesgueEvenOdd(_cbgrid),
 | 
			
		||||
    Umu(&Fgrid),
 | 
			
		||||
    UmuEven(&Hgrid),
 | 
			
		||||
    UmuOdd(&Hgrid),
 | 
			
		||||
    _tmp(&Hgrid)
 | 
			
		||||
{
 | 
			
		||||
  int vol4;
 | 
			
		||||
  int LLs=1;
 | 
			
		||||
  c1=_c1;
 | 
			
		||||
  u0=_u0;
 | 
			
		||||
  vol4= _grid->oSites();
 | 
			
		||||
  Stencil.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
  vol4= _cbgrid->oSites();
 | 
			
		||||
  StencilEven.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
  StencilOdd.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
 | 
			
		||||
						   GridRedBlackCartesian &Hgrid, RealD _mass,
 | 
			
		||||
						   RealD _c1, RealD _u0,
 | 
			
		||||
						   const ImplParams &p)
 | 
			
		||||
  : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
 | 
			
		||||
{
 | 
			
		||||
  ImportGauge(_U);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////
 | 
			
		||||
// Momentum space propagator should be 
 | 
			
		||||
// https://arxiv.org/pdf/hep-lat/9712010.pdf
 | 
			
		||||
//
 | 
			
		||||
// mom space action.
 | 
			
		||||
//   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
 | 
			
		||||
//
 | 
			
		||||
// must track through staggered flavour/spin reduction in literature to 
 | 
			
		||||
// turn to free propagator for the one component chi field, a la page 4/5
 | 
			
		||||
// of the above link to implement a Fourier-based solver.
 | 
			
		||||
////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
 | 
			
		||||
{
 | 
			
		||||
  pickCheckerboard(Even, UmuEven,  Umu);
 | 
			
		||||
  pickCheckerboard(Odd,  UmuOdd ,  Umu);
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U) 
 | 
			
		||||
{
 | 
			
		||||
  GaugeLinkField U(GaugeGrid());
 | 
			
		||||
  DoubledGaugeField _UUU(GaugeGrid());
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Double Store should take two fields for Naik and one hop separately.
 | 
			
		||||
  // Discard teh Naik as Naive
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Apply scale factors to get the right fermion Kinetic term
 | 
			
		||||
  // Could pass coeffs into the double store to save work.
 | 
			
		||||
  // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) 
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  for (int mu = 0; mu < Nd; mu++) {
 | 
			
		||||
 | 
			
		||||
    U = PeekIndex<LorentzIndex>(Umu, mu);
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
 | 
			
		||||
    
 | 
			
		||||
    U = PeekIndex<LorentzIndex>(Umu, mu+4);
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CopyGaugeCheckerboards();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////
 | 
			
		||||
// Implement the interface
 | 
			
		||||
/////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  Dhop(in, out, DaggerNo);
 | 
			
		||||
  axpy(out, mass, in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  Dhop(in, out, DaggerYes);
 | 
			
		||||
  axpy(out, mass, in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
 | 
			
		||||
  if (in.Checkerboard() == Odd) {
 | 
			
		||||
    DhopEO(in, out, DaggerNo);
 | 
			
		||||
  } else {
 | 
			
		||||
    DhopOE(in, out, DaggerNo);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 | 
			
		||||
  if (in.Checkerboard() == Odd) {
 | 
			
		||||
    DhopEO(in, out, DaggerYes);
 | 
			
		||||
  } else {
 | 
			
		||||
    DhopOE(in, out, DaggerYes);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  typename FermionField::scalar_type scal(mass);
 | 
			
		||||
  out = scal * in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  Mooee(in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  out = (1.0 / (mass)) * in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) 
 | 
			
		||||
{
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
  MooeeInv(in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////
 | 
			
		||||
// Internal
 | 
			
		||||
///////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 | 
			
		||||
						GaugeField & mat,
 | 
			
		||||
						const FermionField &A, const FermionField &B, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  assert((dag == DaggerNo) || (dag == DaggerYes));
 | 
			
		||||
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
 | 
			
		||||
  FermionField Btilde(B.Grid());
 | 
			
		||||
  FermionField Atilde(B.Grid());
 | 
			
		||||
  Atilde = A;
 | 
			
		||||
 | 
			
		||||
  st.HaloExchange(B, compressor);
 | 
			
		||||
 | 
			
		||||
  for (int mu = 0; mu < Nd; mu++) {
 | 
			
		||||
 | 
			
		||||
    ////////////////////////
 | 
			
		||||
    // Call the single hop
 | 
			
		||||
    ////////////////////////
 | 
			
		||||
    autoView( U_v      , U, CpuRead);
 | 
			
		||||
    autoView( B_v      , B, CpuWrite);
 | 
			
		||||
    autoView( Btilde_v , Btilde, CpuWrite);
 | 
			
		||||
    thread_for(sss,B.Grid()->oSites(),{
 | 
			
		||||
      Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    assert(0);// need to figure out the force interface with a blasted three link term.
 | 
			
		||||
    
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
 | 
			
		||||
 | 
			
		||||
  conformable(U.Grid(), _grid);
 | 
			
		||||
  conformable(U.Grid(), V.Grid());
 | 
			
		||||
  conformable(U.Grid(), mat.Grid());
 | 
			
		||||
 | 
			
		||||
  mat.Checkerboard() = U.Checkerboard();
 | 
			
		||||
 | 
			
		||||
  DerivInternal(Stencil, Umu, mat, U, V, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
 | 
			
		||||
 | 
			
		||||
  conformable(U.Grid(), _cbgrid);
 | 
			
		||||
  conformable(U.Grid(), V.Grid());
 | 
			
		||||
  conformable(U.Grid(), mat.Grid());
 | 
			
		||||
 | 
			
		||||
  assert(V.Checkerboard() == Even);
 | 
			
		||||
  assert(U.Checkerboard() == Odd);
 | 
			
		||||
  mat.Checkerboard() = Odd;
 | 
			
		||||
 | 
			
		||||
  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
 | 
			
		||||
 | 
			
		||||
  conformable(U.Grid(), _cbgrid);
 | 
			
		||||
  conformable(U.Grid(), V.Grid());
 | 
			
		||||
  conformable(U.Grid(), mat.Grid());
 | 
			
		||||
 | 
			
		||||
  assert(V.Checkerboard() == Odd);
 | 
			
		||||
  assert(U.Checkerboard() == Even);
 | 
			
		||||
  mat.Checkerboard() = Even;
 | 
			
		||||
 | 
			
		||||
  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=2;
 | 
			
		||||
  conformable(in.Grid(), _grid);  // verifies full grid
 | 
			
		||||
  conformable(in.Grid(), out.Grid());
 | 
			
		||||
 | 
			
		||||
  out.Checkerboard() = in.Checkerboard();
 | 
			
		||||
 | 
			
		||||
  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in.Grid(), _cbgrid);    // verifies half grid
 | 
			
		||||
  conformable(in.Grid(), out.Grid());  // drops the cb check
 | 
			
		||||
 | 
			
		||||
  assert(in.Checkerboard() == Even);
 | 
			
		||||
  out.Checkerboard() = Odd;
 | 
			
		||||
 | 
			
		||||
  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in.Grid(), _cbgrid);    // verifies half grid
 | 
			
		||||
  conformable(in.Grid(), out.Grid());  // drops the cb check
 | 
			
		||||
 | 
			
		||||
  assert(in.Checkerboard() == Odd);
 | 
			
		||||
  out.Checkerboard() = Even;
 | 
			
		||||
 | 
			
		||||
  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) 
 | 
			
		||||
{
 | 
			
		||||
  DhopDir(in, out, dir, disp);
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 | 
			
		||||
{
 | 
			
		||||
  assert(0); // Not implemented yet
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) 
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  Stencil.HaloExchange(in, compressor);
 | 
			
		||||
  autoView( Umu_v   ,  Umu, CpuRead);
 | 
			
		||||
  autoView( in_v    ,  in, CpuRead);
 | 
			
		||||
  autoView( out_v   , out, CpuWrite);
 | 
			
		||||
  //  thread_for( sss, in.Grid()->oSites(),{
 | 
			
		||||
  //    Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
 | 
			
		||||
  //  });
 | 
			
		||||
  assert(0);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
					       DoubledGaugeField &U,
 | 
			
		||||
					       const FermionField &in,
 | 
			
		||||
					       FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
 | 
			
		||||
    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
 | 
			
		||||
  else
 | 
			
		||||
    DhopInternalSerialComms(st,lo,U,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
							      DoubledGaugeField &U,
 | 
			
		||||
							      const FermionField &in,
 | 
			
		||||
							      FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  Compressor compressor; 
 | 
			
		||||
  int len =  U.Grid()->oSites();
 | 
			
		||||
 | 
			
		||||
  DhopTotalTime   -= usecond();
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime    -= usecond();
 | 
			
		||||
  st.Prepare();
 | 
			
		||||
  st.HaloGather(in,compressor);
 | 
			
		||||
  DhopFaceTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  DhopCommTime -=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > requests;
 | 
			
		||||
  st.CommunicateBegin(requests);
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
  DhopFaceTime+= usecond();
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Removed explicit thread comms
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  DhopComputeTime    -= usecond();
 | 
			
		||||
  {
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=0;
 | 
			
		||||
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  st.CommunicateComplete(requests);
 | 
			
		||||
  DhopCommTime +=usecond();
 | 
			
		||||
 | 
			
		||||
  // First to enter, last to leave timing
 | 
			
		||||
  DhopFaceTime    -= usecond();
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
  DhopFaceTime    -= usecond();
 | 
			
		||||
 | 
			
		||||
  DhopComputeTime2    -= usecond();
 | 
			
		||||
  {
 | 
			
		||||
    int interior=0;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime2    += usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
							  DoubledGaugeField &U,
 | 
			
		||||
							  const FermionField &in,
 | 
			
		||||
							  FermionField &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  assert((dag == DaggerNo) || (dag == DaggerYes));
 | 
			
		||||
 | 
			
		||||
  DhopTotalTime   -= usecond();
 | 
			
		||||
 | 
			
		||||
  DhopCommTime    -= usecond();
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  st.HaloExchange(in, compressor);
 | 
			
		||||
  DhopCommTime    += usecond();
 | 
			
		||||
 | 
			
		||||
  DhopComputeTime -= usecond();
 | 
			
		||||
  {
 | 
			
		||||
    int interior=1;
 | 
			
		||||
    int exterior=1;
 | 
			
		||||
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime += usecond();
 | 
			
		||||
  DhopTotalTime   += usecond();
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Reporting
 | 
			
		||||
  ////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::Report(void) 
 | 
			
		||||
{
 | 
			
		||||
  Coordinate latt = _grid->GlobalDimensions();
 | 
			
		||||
  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
  RealD NP = _grid->_Nprocessors;
 | 
			
		||||
  RealD NN = _grid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : " 
 | 
			
		||||
	    << DhopCalls   << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : " 
 | 
			
		||||
	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : " 
 | 
			
		||||
	    << DhopCommTime    / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : " 
 | 
			
		||||
	    << DhopComputeTime / DhopCalls << " us" << std::endl;
 | 
			
		||||
 | 
			
		||||
  // Average the compute time
 | 
			
		||||
  _grid->GlobalSum(DhopComputeTime);
 | 
			
		||||
  DhopComputeTime/=NP;
 | 
			
		||||
 | 
			
		||||
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::ZeroCounters(void) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls       = 0;
 | 
			
		||||
  DhopTotalTime   = 0;
 | 
			
		||||
  DhopCommTime    = 0;
 | 
			
		||||
  DhopComputeTime = 0;
 | 
			
		||||
  DhopFaceTime    = 0;
 | 
			
		||||
 | 
			
		||||
  Stencil.ZeroCounters();
 | 
			
		||||
  StencilEven.ZeroCounters();
 | 
			
		||||
  StencilOdd.ZeroCounters();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////// 
 | 
			
		||||
// Conserved current - not yet implemented.
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
							      PropagatorField &q_in_2,
 | 
			
		||||
							      PropagatorField &q_out,
 | 
			
		||||
							      PropagatorField &src,
 | 
			
		||||
							      Current curr_type,
 | 
			
		||||
							      unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                                                         PropagatorField &q_out,
 | 
			
		||||
                                                         PropagatorField &src,
 | 
			
		||||
                                                         Current curr_type,
 | 
			
		||||
                                                         unsigned int mu, 
 | 
			
		||||
                                                         unsigned int tmin,
 | 
			
		||||
                                              unsigned int tmax,
 | 
			
		||||
					      ComplexField &lattice_cmplx)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
 | 
			
		||||
					 DoubledGaugeFieldView &U,
 | 
			
		||||
					 DoubledGaugeFieldView &UUU,
 | 
			
		||||
					 SiteSpinor *buf, int LLs,
 | 
			
		||||
					 SiteSpinor *buf, int sF,
 | 
			
		||||
					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
@@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  gauge2 =(uint64_t)&UU[sU]( Z );				\
 | 
			
		||||
  gauge3 =(uint64_t)&UU[sU]( T ); 
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  // This is the single precision 5th direction vectorised kernel
 | 
			
		||||
#include <Grid/simd/Intel512single.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
 | 
			
		||||
								    DoubledGaugeFieldView &U,
 | 
			
		||||
								    DoubledGaugeFieldView &UUU,
 | 
			
		||||
								    SiteSpinor *buf, int LLs,
 | 
			
		||||
								    SiteSpinor *buf, int sF,
 | 
			
		||||
								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
   for(int s=0;s<LLs;s++){
 | 
			
		||||
  //   for(int s=0;s<LLs;s++){
 | 
			
		||||
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
@@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#include <Grid/simd/Intel512double.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st, 
 | 
			
		||||
								    DoubledGaugeFieldView &U,
 | 
			
		||||
								    DoubledGaugeFieldView &UUU,
 | 
			
		||||
								    SiteSpinor *buf, int LLs,
 | 
			
		||||
								    SiteSpinor *buf, int sF,
 | 
			
		||||
								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHI(addr0,addr1,addr2,addr3);
 | 
			
		||||
@@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 | 
			
		||||
  // This is the single precision 5th direction vectorised kernel
 | 
			
		||||
 | 
			
		||||
#include <Grid/simd/Intel512single.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st, 
 | 
			
		||||
							       DoubledGaugeFieldView &U,
 | 
			
		||||
							       DoubledGaugeFieldView &UUU,
 | 
			
		||||
							       SiteSpinor *buf, int LLs,
 | 
			
		||||
							       SiteSpinor *buf, int sF,
 | 
			
		||||
							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
@@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#include <Grid/simd/Intel512double.h>
 | 
			
		||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st, 
 | 
			
		||||
							       DoubledGaugeFieldView &U,
 | 
			
		||||
							       DoubledGaugeFieldView &UUU,
 | 
			
		||||
							       SiteSpinor *buf, int LLs,
 | 
			
		||||
							       SiteSpinor *buf, int sF,
 | 
			
		||||
							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 | 
			
		||||
  StencilEntry *SE2;
 | 
			
		||||
  StencilEntry *SE3;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
    // Xp, Yp, Zp, Tp
 | 
			
		||||
    PREPARE(Xp,Yp,Zp,Tp,0,U);
 | 
			
		||||
    LOAD_CHIa(addr0,addr1);
 | 
			
		||||
 
 | 
			
		||||
@@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
 | 
			
		||||
					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 | 
			
		||||
					  SiteSpinor *buf, int LLs, int sU, 
 | 
			
		||||
					  SiteSpinor *buf, int sF, int sU, 
 | 
			
		||||
					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
@@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int skew;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    skew = 0;
 | 
			
		||||
    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
 | 
			
		||||
@@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
 | 
			
		||||
    if (Naik) {
 | 
			
		||||
    skew = 8;
 | 
			
		||||
    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
 | 
			
		||||
@@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
 | 
			
		||||
    
 | 
			
		||||
    }    
 | 
			
		||||
    if ( dag ) {
 | 
			
		||||
      result()()(0) = - even_0 - odd_0;
 | 
			
		||||
      result()()(1) = - even_1 - odd_1;
 | 
			
		||||
@@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
 | 
			
		||||
					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
					     SiteSpinor *buf, int LLs, int sU, 
 | 
			
		||||
					     SiteSpinor *buf, int sF, int sU, 
 | 
			
		||||
					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
@@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int skew;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
 | 
			
		||||
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
 | 
			
		||||
@@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
 | 
			
		||||
    if (Naik) {
 | 
			
		||||
    skew = 8;
 | 
			
		||||
    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
 | 
			
		||||
@@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
 | 
			
		||||
    if ( dag ) {
 | 
			
		||||
      result()()(0) = - even_0 - odd_0;
 | 
			
		||||
@@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 | 
			
		||||
					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
					     SiteSpinor *buf, int LLs, int sU, 
 | 
			
		||||
					     SiteSpinor *buf, int sF, int sU, 
 | 
			
		||||
					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
@@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int skew;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=s+LLs*sU;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=s+LLs*sU;
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
 | 
			
		||||
     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
 | 
			
		||||
@@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
 | 
			
		||||
    if (Naik) {
 | 
			
		||||
    skew = 8;
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
 | 
			
		||||
@@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
 | 
			
		||||
    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    // Add sum of all exterior connected stencil legs
 | 
			
		||||
    if ( nmu ) { 
 | 
			
		||||
      if ( dag ) {
 | 
			
		||||
@@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
 | 
			
		||||
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 | 
			
		||||
						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 | 
			
		||||
@@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
 | 
			
		||||
						     SiteSpinor *buf, int LLs, int sU, \
 | 
			
		||||
						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
#undef LOAD_CHI
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
  if (SE->_is_local ) {						\
 | 
			
		||||
    if (SE->_permute) {						\
 | 
			
		||||
      chi_p = χ						\
 | 
			
		||||
      permute(chi,  in[SE->_offset], ptype);		\
 | 
			
		||||
      permute(chi,  in[SE->_offset], ptype);			\
 | 
			
		||||
    } else {							\
 | 
			
		||||
      chi_p = &in[SE->_offset];				\
 | 
			
		||||
      chi_p = &in[SE->_offset];					\
 | 
			
		||||
    }								\
 | 
			
		||||
  } else {							\
 | 
			
		||||
    chi_p = &buf[SE->_offset];					\
 | 
			
		||||
@@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
  if (SE->_is_local ) {						\
 | 
			
		||||
    if (SE->_permute) {						\
 | 
			
		||||
      chi_p = χ						\
 | 
			
		||||
      permute(chi,  in[SE->_offset], ptype);		\
 | 
			
		||||
      permute(chi,  in[SE->_offset], ptype);			\
 | 
			
		||||
    } else {							\
 | 
			
		||||
      chi_p = &in[SE->_offset];				\
 | 
			
		||||
      chi_p = &in[SE->_offset];					\
 | 
			
		||||
    }								\
 | 
			
		||||
  } else if ( st.same_node[Dir] ) {				\
 | 
			
		||||
    chi_p = &buf[SE->_offset];					\
 | 
			
		||||
  }								\
 | 
			
		||||
  if (SE->_is_local || st.same_node[Dir] ) {			\
 | 
			
		||||
    multLink(Uchi, U[sU], *chi_p, Dir);			\
 | 
			
		||||
    multLink(Uchi, U[sU], *chi_p, Dir);				\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
 | 
			
		||||
@@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
 | 
			
		||||
    nmu++;							\
 | 
			
		||||
    chi_p = &buf[SE->_offset];					\
 | 
			
		||||
    multLink(Uchi, U[sU], *chi_p, Dir);			\
 | 
			
		||||
    multLink(Uchi, U[sU], *chi_p, Dir);				\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
@@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 | 
			
		||||
// Int, Ext, Int+Ext cases for comms overlap
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
 | 
			
		||||
					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
					     SiteSpinor *buf, int LLs, int sU, 
 | 
			
		||||
					     const FermionFieldView &in, FermionFieldView &out, int dag) {
 | 
			
		||||
					     SiteSpinor *buf, int sF, int sU, 
 | 
			
		||||
					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 | 
			
		||||
{
 | 
			
		||||
  const SiteSpinor *chi_p;
 | 
			
		||||
  SiteSpinor chi;
 | 
			
		||||
  SiteSpinor Uchi;
 | 
			
		||||
@@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int skew;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=LLs*sU+s;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //
 | 
			
		||||
  //    int sF=LLs*sU+s;
 | 
			
		||||
  {
 | 
			
		||||
    skew = 0;
 | 
			
		||||
    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
 | 
			
		||||
    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
    if ( Naik ) {
 | 
			
		||||
    skew=8;
 | 
			
		||||
    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
    }
 | 
			
		||||
    if ( dag ) { 
 | 
			
		||||
      Uchi = - Uchi;
 | 
			
		||||
    } 
 | 
			
		||||
@@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
  // Only contributions from interior of our node
 | 
			
		||||
  ///////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 | 
			
		||||
						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
						SiteSpinor *buf, int LLs, int sU, 
 | 
			
		||||
						SiteSpinor *buf, int sF, int sU, 
 | 
			
		||||
						const FermionFieldView &in, FermionFieldView &out,int dag) {
 | 
			
		||||
  const SiteSpinor *chi_p;
 | 
			
		||||
  SiteSpinor chi;
 | 
			
		||||
@@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int skew ;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=LLs*sU+s;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=LLs*sU+s;
 | 
			
		||||
  {
 | 
			
		||||
    skew = 0;
 | 
			
		||||
    Uchi=Zero();
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
    if ( Naik ) {
 | 
			
		||||
    skew=8;
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
    }
 | 
			
		||||
    if ( dag ) {
 | 
			
		||||
      Uchi = - Uchi;
 | 
			
		||||
    }
 | 
			
		||||
@@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
  // Only contributions from exterior of our node
 | 
			
		||||
  ///////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
template <int Naik>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 | 
			
		||||
						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
						SiteSpinor *buf, int LLs, int sU,
 | 
			
		||||
						SiteSpinor *buf, int sF, int sU,
 | 
			
		||||
						const FermionFieldView &in, FermionFieldView &out,int dag) {
 | 
			
		||||
  const SiteSpinor *chi_p;
 | 
			
		||||
  //  SiteSpinor chi;
 | 
			
		||||
@@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  int skew ;
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<LLs;s++){
 | 
			
		||||
    int sF=LLs*sU+s;
 | 
			
		||||
  //  for(int s=0;s<LLs;s++){
 | 
			
		||||
  //    int sF=LLs*sU+s;
 | 
			
		||||
  {
 | 
			
		||||
    skew = 0;
 | 
			
		||||
    Uchi=Zero();
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
    if ( Naik ) {
 | 
			
		||||
    skew=8;
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
 | 
			
		||||
@@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
 | 
			
		||||
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    if ( nmu ) { 
 | 
			
		||||
      if ( dag ) { 
 | 
			
		||||
	out[sF] = out[sF] - Uchi;
 | 
			
		||||
@@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Driving / wrapping routine to select right kernel
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
					 SiteSpinor *buf, int LLs, int sU,
 | 
			
		||||
					 const FermionFieldView &in, FermionFieldView &out,
 | 
			
		||||
					 int interior,int exterior)
 | 
			
		||||
{
 | 
			
		||||
  int dag=1;
 | 
			
		||||
  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
				      SiteSpinor *buf, int LLs, int sU,
 | 
			
		||||
				      const FermionFieldView &in, FermionFieldView &out,
 | 
			
		||||
				      int interior,int exterior)
 | 
			
		||||
{
 | 
			
		||||
  int dag=0;
 | 
			
		||||
  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 | 
			
		||||
				      SiteSpinor *buf, int LLs,
 | 
			
		||||
				      int sU, const FermionFieldView &in, FermionFieldView &out,
 | 
			
		||||
				      int dag,int interior,int exterior) 
 | 
			
		||||
{
 | 
			
		||||
  switch(Opt) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
  case OptInlineAsm:
 | 
			
		||||
    if ( interior && exterior ) {
 | 
			
		||||
      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    } else { 
 | 
			
		||||
      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    break;
 | 
			
		||||
#endif
 | 
			
		||||
  case OptHandUnroll:
 | 
			
		||||
    if ( interior && exterior ) {
 | 
			
		||||
      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    } else if ( interior ) {
 | 
			
		||||
      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    } else if ( exterior ) {
 | 
			
		||||
      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    }
 | 
			
		||||
    break;
 | 
			
		||||
  case OptGeneric:
 | 
			
		||||
    if ( interior && exterior ) {
 | 
			
		||||
      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    } else if ( interior ) {
 | 
			
		||||
      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    } else if ( exterior ) {
 | 
			
		||||
      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
 | 
			
		||||
    }
 | 
			
		||||
    break;
 | 
			
		||||
  default:
 | 
			
		||||
    std::cout<<"Oops Opt = "<<Opt<<std::endl;
 | 
			
		||||
    assert(0);
 | 
			
		||||
    break;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
 | 
			
		||||
					    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
 | 
			
		||||
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
 | 
			
		||||
					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 | 
			
		||||
{
 | 
			
		||||
  // Disp should be either +1,-1,+3,-3
 | 
			
		||||
  // What about "dag" ?
 | 
			
		||||
@@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define KERNEL_CALLNB(A,improved)					\
 | 
			
		||||
  const uint64_t    NN = Nsite*Ls;					\
 | 
			
		||||
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
 | 
			
		||||
      int sF = ss;							\
 | 
			
		||||
      int sU = ss/Ls;							\
 | 
			
		||||
      ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); 
 | 
			
		||||
 | 
			
		||||
#define ASM_CALL(A)							\
 | 
			
		||||
  const uint64_t    NN = Nsite*Ls;					\
 | 
			
		||||
  thread_for( ss, NN, {							\
 | 
			
		||||
      int sF = ss;							\
 | 
			
		||||
      int sU = ss/Ls;							\
 | 
			
		||||
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 | 
			
		||||
					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *FGrid=in.Grid();  
 | 
			
		||||
  GridBase *UGrid=U.Grid();  
 | 
			
		||||
  typedef StaggeredKernels<Impl> ThisKernel;
 | 
			
		||||
  autoView( UUU_v , UUU, AcceleratorRead);
 | 
			
		||||
  autoView( U_v   ,   U, AcceleratorRead);
 | 
			
		||||
  autoView( in_v  ,  in, AcceleratorRead);
 | 
			
		||||
  autoView( out_v , out, AcceleratorWrite);
 | 
			
		||||
  autoView( st_v  ,  st, AcceleratorRead);
 | 
			
		||||
  SiteSpinor * buf = st.CommBuf();
 | 
			
		||||
    
 | 
			
		||||
  int Ls=1;
 | 
			
		||||
  if(FGrid->Nd()==UGrid->Nd()+1){
 | 
			
		||||
    Ls    = FGrid->_rdimensions[0];
 | 
			
		||||
  }
 | 
			
		||||
  int Nsite = UGrid->oSites();
 | 
			
		||||
 | 
			
		||||
  if( interior && exterior ) { 
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
 | 
			
		||||
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
  } else if( interior ) {
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
  } else if( exterior ) { 
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
  assert(0 && " Kernel optimisation case not covered ");
 | 
			
		||||
}
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, 
 | 
			
		||||
				       DoubledGaugeField &U,
 | 
			
		||||
				       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 | 
			
		||||
{
 | 
			
		||||
  GridBase *FGrid=in.Grid();  
 | 
			
		||||
  GridBase *UGrid=U.Grid();  
 | 
			
		||||
  typedef StaggeredKernels<Impl> ThisKernel;
 | 
			
		||||
  autoView( UUU_v ,   U, AcceleratorRead);
 | 
			
		||||
  autoView( U_v   ,   U, AcceleratorRead);
 | 
			
		||||
  autoView( in_v  ,  in, AcceleratorRead);
 | 
			
		||||
  autoView( out_v , out, AcceleratorWrite);
 | 
			
		||||
  autoView( st_v  ,  st, AcceleratorRead);
 | 
			
		||||
  SiteSpinor * buf = st.CommBuf();
 | 
			
		||||
 | 
			
		||||
  int Ls=1;
 | 
			
		||||
  if(FGrid->Nd()==UGrid->Nd()+1){
 | 
			
		||||
    Ls    = FGrid->_rdimensions[0];
 | 
			
		||||
  }
 | 
			
		||||
  int Nsite = UGrid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  if( interior && exterior ) { 
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
  } else if( interior ) {
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
  } else if( exterior ) { 
 | 
			
		||||
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef KERNEL_CALLNB
 | 
			
		||||
#undef KERNEL_CALL
 | 
			
		||||
#undef ASM_CALL
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -98,32 +98,35 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
			
		||||
  Coordinate lcoor;
 | 
			
		||||
  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
 | 
			
		||||
 | 
			
		||||
  for (int site = 0; site < lvol; site++)
 | 
			
		||||
  {
 | 
			
		||||
    grid->LocalIndexToLocalCoor(site, lcoor);
 | 
			
		||||
    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
 | 
			
		||||
    peekLocalSite(Qx, CloverTerm, lcoor);
 | 
			
		||||
    Qxinv = Zero();
 | 
			
		||||
    //if (csw!=0){
 | 
			
		||||
    for (int j = 0; j < Ns; j++)
 | 
			
		||||
      for (int k = 0; k < Ns; k++)
 | 
			
		||||
        for (int a = 0; a < DimRep; a++)
 | 
			
		||||
          for (int b = 0; b < DimRep; b++){
 | 
			
		||||
	    auto zz =  Qx()(j, k)(a, b);
 | 
			
		||||
            EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 | 
			
		||||
	  }
 | 
			
		||||
    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
 | 
			
		||||
 | 
			
		||||
    EigenInvCloverOp = EigenCloverOp.inverse();
 | 
			
		||||
    //std::cout << EigenInvCloverOp << std::endl;
 | 
			
		||||
    for (int j = 0; j < Ns; j++)
 | 
			
		||||
      for (int k = 0; k < Ns; k++)
 | 
			
		||||
        for (int a = 0; a < DimRep; a++)
 | 
			
		||||
          for (int b = 0; b < DimRep; b++)
 | 
			
		||||
            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
 | 
			
		||||
    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
 | 
			
		||||
    //  }
 | 
			
		||||
    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
 | 
			
		||||
    autoView(CTv,CloverTerm,CpuRead);
 | 
			
		||||
    autoView(CTIv,CloverTermInv,CpuWrite);
 | 
			
		||||
    for (int site = 0; site < lvol; site++) {
 | 
			
		||||
      grid->LocalIndexToLocalCoor(site, lcoor);
 | 
			
		||||
      EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
 | 
			
		||||
      peekLocalSite(Qx, CTv, lcoor);
 | 
			
		||||
      Qxinv = Zero();
 | 
			
		||||
      //if (csw!=0){
 | 
			
		||||
      for (int j = 0; j < Ns; j++)
 | 
			
		||||
	for (int k = 0; k < Ns; k++)
 | 
			
		||||
	  for (int a = 0; a < DimRep; a++)
 | 
			
		||||
	    for (int b = 0; b < DimRep; b++){
 | 
			
		||||
	      auto zz =  Qx()(j, k)(a, b);
 | 
			
		||||
	      EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
 | 
			
		||||
	    }
 | 
			
		||||
      //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
 | 
			
		||||
      
 | 
			
		||||
      EigenInvCloverOp = EigenCloverOp.inverse();
 | 
			
		||||
      //std::cout << EigenInvCloverOp << std::endl;
 | 
			
		||||
      for (int j = 0; j < Ns; j++)
 | 
			
		||||
	for (int k = 0; k < Ns; k++)
 | 
			
		||||
	  for (int a = 0; a < DimRep; a++)
 | 
			
		||||
	    for (int b = 0; b < DimRep; b++)
 | 
			
		||||
	      Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
 | 
			
		||||
      //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
 | 
			
		||||
      //  }
 | 
			
		||||
      pokeLocalSite(Qxinv, CTIv, lcoor);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Separate the even and odd parts
 | 
			
		||||
 
 | 
			
		||||
@@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
 | 
			
		||||
  cosha = (one + W*W + sk) / (abs(W)*2.0);
 | 
			
		||||
 | 
			
		||||
  // FIXME Need a Lattice acosh
 | 
			
		||||
  for(int idx=0;idx<_grid->lSites();idx++){
 | 
			
		||||
    Coordinate lcoor(Nd);
 | 
			
		||||
    Tcomplex cc;
 | 
			
		||||
    //    RealD sgn;
 | 
			
		||||
    _grid->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    peekLocalSite(cc,cosha,lcoor);
 | 
			
		||||
    assert((double)real(cc)>=1.0);
 | 
			
		||||
    assert(fabs((double)imag(cc))<=1.0e-15);
 | 
			
		||||
    cc = ScalComplex(::acosh(real(cc)),0.0);
 | 
			
		||||
    pokeLocalSite(cc,a,lcoor);
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    autoView(cosha_v,cosha,CpuRead);
 | 
			
		||||
    autoView(a_v,a,CpuWrite);
 | 
			
		||||
    for(int idx=0;idx<_grid->lSites();idx++){
 | 
			
		||||
      Coordinate lcoor(Nd);
 | 
			
		||||
      Tcomplex cc;
 | 
			
		||||
      //    RealD sgn;
 | 
			
		||||
      _grid->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
      peekLocalSite(cc,cosha_v,lcoor);
 | 
			
		||||
      assert((double)real(cc)>=1.0);
 | 
			
		||||
      assert(fabs((double)imag(cc))<=1.0e-15);
 | 
			
		||||
      cc = ScalComplex(::acosh(real(cc)),0.0);
 | 
			
		||||
      pokeLocalSite(cc,a_v,lcoor);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Wea = ( exp( a) * abs(W)  );
 | 
			
		||||
@@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
 | 
			
		||||
  cosha =  (one + W*W + sk) / (abs(W)*2.0);
 | 
			
		||||
 | 
			
		||||
  // FIXME Need a Lattice acosh
 | 
			
		||||
  {
 | 
			
		||||
  autoView(cosha_v,cosha,CpuRead);
 | 
			
		||||
  autoView(a_v,a,CpuWrite);
 | 
			
		||||
  for(int idx=0;idx<_grid->lSites();idx++){
 | 
			
		||||
    Coordinate lcoor(Nd);
 | 
			
		||||
    Tcomplex cc;
 | 
			
		||||
    //    RealD sgn;
 | 
			
		||||
    _grid->LocalIndexToLocalCoor(idx,lcoor);
 | 
			
		||||
    peekLocalSite(cc,cosha,lcoor);
 | 
			
		||||
    peekLocalSite(cc,cosha_v,lcoor);
 | 
			
		||||
    assert((double)real(cc)>=1.0);
 | 
			
		||||
    assert(fabs((double)imag(cc))<=1.0e-15);
 | 
			
		||||
    cc = ScalComplex(::acosh(real(cc)),0.0);
 | 
			
		||||
    pokeLocalSite(cc,a,lcoor);
 | 
			
		||||
  }
 | 
			
		||||
    pokeLocalSite(cc,a_v,lcoor);
 | 
			
		||||
  }}
 | 
			
		||||
  
 | 
			
		||||
  Wea = ( exp( a) * abs(W)  );
 | 
			
		||||
  Wema= ( exp(-a) * abs(W)  );
 | 
			
		||||
 
 | 
			
		||||
@@ -67,7 +67,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 | 
			
		||||
    diag_mass = 4.0 + mass;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  int vol4;
 | 
			
		||||
  vol4=Fgrid.oSites();
 | 
			
		||||
  Stencil.BuildSurfaceList(1,vol4);
 | 
			
		||||
  vol4=Hgrid.oSites();
 | 
			
		||||
  StencilEven.BuildSurfaceList(1,vol4);
 | 
			
		||||
  StencilOdd.BuildSurfaceList(1,vol4);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
@@ -483,32 +488,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
  conformable(_grid, q_in_1.Grid());
 | 
			
		||||
  conformable(_grid, q_in_2.Grid());
 | 
			
		||||
  conformable(_grid, q_out.Grid());
 | 
			
		||||
#if 0
 | 
			
		||||
  PropagatorField tmp1(_grid), tmp2(_grid);
 | 
			
		||||
  q_out = Zero();
 | 
			
		||||
 | 
			
		||||
  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
 | 
			
		||||
  // Inefficient comms method but not performance critical.
 | 
			
		||||
  tmp1 = Cshift(q_in_1, mu, 1);
 | 
			
		||||
  tmp2 = Cshift(q_in_2, mu, 1);
 | 
			
		||||
  auto tmp1_v  =  tmp1.View();
 | 
			
		||||
  auto tmp2_v  =  tmp2.View();
 | 
			
		||||
  auto q_in_1_v=q_in_1.View();
 | 
			
		||||
  auto q_in_2_v=q_in_2.View();
 | 
			
		||||
  auto q_out_v = q_out.View();
 | 
			
		||||
  auto Umu_v   =   Umu.View();
 | 
			
		||||
  thread_for(sU, Umu.Grid()->oSites(),{
 | 
			
		||||
      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
 | 
			
		||||
					       q_in_2_v[sU],
 | 
			
		||||
					       q_out_v[sU],
 | 
			
		||||
					       Umu_v, sU, mu);
 | 
			
		||||
      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
 | 
			
		||||
					       tmp2_v[sU],
 | 
			
		||||
					       q_out_v[sU],
 | 
			
		||||
					       Umu_v, sU, mu);
 | 
			
		||||
  });
 | 
			
		||||
#else
 | 
			
		||||
#endif
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -524,62 +504,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
{
 | 
			
		||||
  conformable(_grid, q_in.Grid());
 | 
			
		||||
  conformable(_grid, q_out.Grid());
 | 
			
		||||
#if 0
 | 
			
		||||
 | 
			
		||||
  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
 | 
			
		||||
  Complex i(0.0,1.0);
 | 
			
		||||
  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
 | 
			
		||||
  unsigned int tshift = (mu == Tp) ? 1 : 0;
 | 
			
		||||
  unsigned int LLt    = GridDefaultLatt()[Tp];
 | 
			
		||||
 | 
			
		||||
  q_out = Zero();
 | 
			
		||||
  LatticeInteger coords(_grid);
 | 
			
		||||
  LatticeCoordinate(coords, Tp);
 | 
			
		||||
 | 
			
		||||
  // Need q(x + mu) and q(x - mu).
 | 
			
		||||
  tmp    = Cshift(q_in, mu, 1);
 | 
			
		||||
  tmpFwd = tmp*lattice_cmplx;
 | 
			
		||||
  tmp    = lattice_cmplx*q_in;
 | 
			
		||||
  tmpBwd = Cshift(tmp, mu, -1);
 | 
			
		||||
 | 
			
		||||
  auto coords_v = coords.View();
 | 
			
		||||
  auto tmpFwd_v = tmpFwd.View();
 | 
			
		||||
  auto tmpBwd_v = tmpBwd.View();
 | 
			
		||||
  auto Umu_v    = Umu.View();
 | 
			
		||||
  auto q_out_v  = q_out.View();
 | 
			
		||||
 | 
			
		||||
  thread_for(sU, Umu.Grid()->oSites(), {
 | 
			
		||||
 | 
			
		||||
    // Compute the sequential conserved current insertion only if our simd
 | 
			
		||||
    // object contains a timeslice we need.
 | 
			
		||||
    vPredicate t_mask;
 | 
			
		||||
    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
 | 
			
		||||
    Integer timeSlices = Reduce(t_mask());
 | 
			
		||||
 | 
			
		||||
    if (timeSlices > 0) {
 | 
			
		||||
      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
 | 
			
		||||
					  q_out_v[sU], 
 | 
			
		||||
					  Umu_v, sU, mu, t_mask);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Repeat for backward direction.
 | 
			
		||||
    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
 | 
			
		||||
		    (coords_v[sU] <= (tmax + tshift)));
 | 
			
		||||
    
 | 
			
		||||
    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
 | 
			
		||||
    unsigned int t0 = 0;
 | 
			
		||||
    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
 | 
			
		||||
    
 | 
			
		||||
    timeSlices = Reduce(t_mask());
 | 
			
		||||
 | 
			
		||||
    if (timeSlices > 0) {
 | 
			
		||||
      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
 | 
			
		||||
					  q_out_v[sU], 
 | 
			
		||||
					  Umu_v, sU, mu, t_mask);
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
#else
 | 
			
		||||
#endif
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
// Generic implementation; move to different file?
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
{
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); 
 | 
			
		||||
  uint4 * mem_pun  = (uint4 *)mem; // force 128 bit loads
 | 
			
		||||
  uint4 * chip_pun = (uint4 *)&chip;
 | 
			
		||||
@@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
#endif
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
  
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
 | 
			
		||||
  SE = st.GetEntry(ptype, Dir, sF);				\
 | 
			
		||||
  if (SE->_is_local) {						\
 | 
			
		||||
@@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
  } else {							\
 | 
			
		||||
    chi = coalescedRead(buf[SE->_offset],lane);			\
 | 
			
		||||
  }								\
 | 
			
		||||
  synchronise();						\
 | 
			
		||||
  acceleratorSynchronise();						\
 | 
			
		||||
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
 | 
			
		||||
  Recon(result, Uchi);
 | 
			
		||||
  
 | 
			
		||||
@@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
  } else if ( st.same_node[Dir] ) {				\
 | 
			
		||||
    chi = coalescedRead(buf[SE->_offset],lane);			\
 | 
			
		||||
  }								\
 | 
			
		||||
  synchronise();						\
 | 
			
		||||
  acceleratorSynchronise();						\
 | 
			
		||||
  if (SE->_is_local || st.same_node[Dir] ) {			\
 | 
			
		||||
    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
 | 
			
		||||
    Recon(result, Uchi);					\
 | 
			
		||||
  }								\
 | 
			
		||||
  synchronise();						
 | 
			
		||||
  acceleratorSynchronise();						
 | 
			
		||||
 | 
			
		||||
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
 | 
			
		||||
  SE = st.GetEntry(ptype, Dir, sF);				\
 | 
			
		||||
@@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
    Recon(result, Uchi);					\
 | 
			
		||||
    nmu++;							\
 | 
			
		||||
  }								\
 | 
			
		||||
  synchronise();						
 | 
			
		||||
  acceleratorSynchronise();						
 | 
			
		||||
 | 
			
		||||
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
 | 
			
		||||
    if (SE->_is_local ) {					\
 | 
			
		||||
@@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 | 
			
		||||
    } else {							\
 | 
			
		||||
      chi = coalescedRead(buf[SE->_offset],lane);		\
 | 
			
		||||
    }								\
 | 
			
		||||
    synchronise();						\
 | 
			
		||||
    acceleratorSynchronise();					\
 | 
			
		||||
    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
 | 
			
		||||
    Recon(result, Uchi);					
 | 
			
		||||
 | 
			
		||||
@@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
 | 
			
		||||
@@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
 | 
			
		||||
  int ptype;
 | 
			
		||||
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
 | 
			
		||||
  GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
 | 
			
		||||
@@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
 | 
			
		||||
  result=Zero();
 | 
			
		||||
  GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
 | 
			
		||||
@@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField
 | 
			
		||||
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
 | 
			
		||||
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
 | 
			
		||||
  calcHalfSpinor chi;
 | 
			
		||||
  //  calcHalfSpinor *chi_p;
 | 
			
		||||
@@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
  result=Zero();
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
 | 
			
		||||
@@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 | 
			
		||||
  int ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
  result=Zero();
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
 | 
			
		||||
  GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
 | 
			
		||||
@@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField
 | 
			
		||||
  StencilEntry *SE;							\
 | 
			
		||||
  int ptype;								\
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();				\
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);					\
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);					\
 | 
			
		||||
									\
 | 
			
		||||
  SE = st.GetEntry(ptype, dir, sF);					\
 | 
			
		||||
  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
 | 
			
		||||
@@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int ptype;
 | 
			
		||||
  const int Nsimd = SiteHalfSpinor::Nsimd();
 | 
			
		||||
  const int lane=SIMTlane(Nsimd);
 | 
			
		||||
  const int lane=acceleratorSIMTlane(Nsimd);
 | 
			
		||||
 | 
			
		||||
  SE = st.GetEntry(ptype, dir, sF);
 | 
			
		||||
  GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
 | 
			
		||||
@@ -346,30 +348,30 @@ template <class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 | 
			
		||||
				      int Nsite, const FermionField &in, std::vector<FermionField> &out) 
 | 
			
		||||
{
 | 
			
		||||
   auto U_v   = U.View();
 | 
			
		||||
   auto in_v  = in.View();
 | 
			
		||||
   auto st_v  = st.View();
 | 
			
		||||
   autoView(U_v  ,U,AcceleratorRead);
 | 
			
		||||
   autoView(in_v ,in,AcceleratorRead);
 | 
			
		||||
   autoView(st_v ,st,AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
   auto out_Xm = out[0].View();
 | 
			
		||||
   auto out_Ym = out[1].View();
 | 
			
		||||
   auto out_Zm = out[2].View();
 | 
			
		||||
   auto out_Tm = out[3].View();
 | 
			
		||||
   auto out_Xp = out[4].View();
 | 
			
		||||
   auto out_Yp = out[5].View();
 | 
			
		||||
   auto out_Zp = out[6].View();
 | 
			
		||||
   auto out_Tp = out[7].View();
 | 
			
		||||
 | 
			
		||||
   accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
 | 
			
		||||
   autoView(out_Xm,out[0],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Ym,out[1],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Zm,out[2],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Tm,out[3],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Xp,out[4],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Yp,out[5],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Zp,out[6],AcceleratorWrite);
 | 
			
		||||
   autoView(out_Tp,out[7],AcceleratorWrite);
 | 
			
		||||
   auto CBp=st.CommBuf();
 | 
			
		||||
   accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
 | 
			
		||||
      int sU=sss/Ls;				
 | 
			
		||||
      int sF =sss;				
 | 
			
		||||
      DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0);
 | 
			
		||||
      DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1);
 | 
			
		||||
      DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2);
 | 
			
		||||
      DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3);
 | 
			
		||||
      DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4);
 | 
			
		||||
      DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5);
 | 
			
		||||
      DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6);
 | 
			
		||||
      DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7);
 | 
			
		||||
      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
 | 
			
		||||
      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
 | 
			
		||||
      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
 | 
			
		||||
      DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
 | 
			
		||||
      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
 | 
			
		||||
      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
 | 
			
		||||
      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
 | 
			
		||||
      DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
 | 
			
		||||
   });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 | 
			
		||||
  assert(dirdisp<=7);
 | 
			
		||||
  assert(dirdisp>=0);
 | 
			
		||||
 | 
			
		||||
   auto U_v   = U.View();
 | 
			
		||||
   auto in_v  = in.View();
 | 
			
		||||
   auto out_v = out.View();
 | 
			
		||||
   auto st_v  = st.View();
 | 
			
		||||
   autoView(U_v  ,U  ,AcceleratorRead);
 | 
			
		||||
   autoView(in_v ,in ,AcceleratorRead);
 | 
			
		||||
   autoView(out_v,out,AcceleratorWrite);
 | 
			
		||||
   autoView(st_v ,st ,AcceleratorRead);
 | 
			
		||||
   auto CBp=st.CommBuf();			
 | 
			
		||||
#define LoopBody(Dir)				\
 | 
			
		||||
   case Dir :			\
 | 
			
		||||
     accelerator_forNB(ss,Nsite,Simd::Nsimd(),{	\
 | 
			
		||||
   case Dir :					\
 | 
			
		||||
     accelerator_for(ss,Nsite,Simd::Nsimd(),{	\
 | 
			
		||||
       for(int s=0;s<Ls;s++){			\
 | 
			
		||||
	 int sU=ss;				\
 | 
			
		||||
	 int sF = s+Ls*sU;						\
 | 
			
		||||
	 DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\
 | 
			
		||||
	 DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
 | 
			
		||||
       }							       \
 | 
			
		||||
       });							       \
 | 
			
		||||
     break;
 | 
			
		||||
@@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
				     int Ls, int Nsite, const FermionField &in, FermionField &out,
 | 
			
		||||
				     int interior,int exterior) 
 | 
			
		||||
{
 | 
			
		||||
    auto U_v   =   U.View();
 | 
			
		||||
    auto in_v  =  in.View();
 | 
			
		||||
    auto out_v = out.View();
 | 
			
		||||
    auto st_v  =  st.View();
 | 
			
		||||
    autoView(U_v  ,  U,AcceleratorRead);
 | 
			
		||||
    autoView(in_v , in,AcceleratorRead);
 | 
			
		||||
    autoView(out_v,out,AcceleratorWrite);
 | 
			
		||||
    autoView(st_v , st,AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
   if( interior && exterior ) { 
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
   } else if( interior ) {
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
   } else if( exterior ) { 
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
@@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
					  int Ls, int Nsite, const FermionField &in, FermionField &out,
 | 
			
		||||
					  int interior,int exterior) 
 | 
			
		||||
  {
 | 
			
		||||
    auto U_v   = U.View();
 | 
			
		||||
    auto in_v  = in.View();
 | 
			
		||||
    auto out_v = out.View();
 | 
			
		||||
    auto st_v  = st.View();
 | 
			
		||||
    autoView(U_v  ,U,AcceleratorRead);
 | 
			
		||||
    autoView(in_v ,in,AcceleratorRead);
 | 
			
		||||
    autoView(out_v,out,AcceleratorWrite);
 | 
			
		||||
    autoView(st_v ,st,AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
   if( interior && exterior ) { 
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
   } else if( interior ) {
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
   } else if( exterior ) { 
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
@@ -493,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
   assert(0 && " Kernel optimisation case not covered ");
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#undef KERNEL_CALLNB
 | 
			
		||||
#undef KERNEL_CALL
 | 
			
		||||
#undef ASM_CALL
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,36 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi, Peter Boyle
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
			   /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
 | 
			
		||||
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -0,0 +1,37 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi, Peter Boyle
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution
 | 
			
		||||
directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
			   /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
#include "impl.h"
 | 
			
		||||
template class NaiveStaggeredFermion<IMPLEMENTATION>; 
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -0,0 +1 @@
 | 
			
		||||
../NaiveStaggeredFermionInstantiation.cc.master
 | 
			
		||||
@@ -0,0 +1 @@
 | 
			
		||||
../NaiveStaggeredFermionInstantiation.cc.master
 | 
			
		||||
@@ -88,6 +88,7 @@ done
 | 
			
		||||
CC_LIST=" \
 | 
			
		||||
  ImprovedStaggeredFermion5DInstantiation \
 | 
			
		||||
  ImprovedStaggeredFermionInstantiation \
 | 
			
		||||
  NaiveStaggeredFermionInstantiation \
 | 
			
		||||
  StaggeredKernelsInstantiation "
 | 
			
		||||
 | 
			
		||||
for impl in $STAG_IMPL_LIST
 | 
			
		||||
 
 | 
			
		||||
@@ -86,9 +86,9 @@ public:
 | 
			
		||||
 | 
			
		||||
  // Move this elsewhere? FIXME
 | 
			
		||||
  static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
 | 
			
		||||
    auto U_v = U.View();
 | 
			
		||||
    auto W_v = W.View();
 | 
			
		||||
    thread_for( ss, U.Grid()->oSites(), {
 | 
			
		||||
    autoView(U_v,U,AcceleratorWrite);
 | 
			
		||||
    autoView(W_v,W,AcceleratorRead);
 | 
			
		||||
    accelerator_for( ss, U.Grid()->oSites(), 1, {
 | 
			
		||||
      U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
@@ -131,15 +131,14 @@ public:
 | 
			
		||||
    //static std::chrono::duration<double> diff;
 | 
			
		||||
 | 
			
		||||
    //auto start = std::chrono::high_resolution_clock::now();
 | 
			
		||||
    auto U_v = U.View();
 | 
			
		||||
    auto P_v = P.View();
 | 
			
		||||
    thread_for(ss, P.Grid()->oSites(),{
 | 
			
		||||
    autoView(U_v,U,AcceleratorWrite);
 | 
			
		||||
    autoView(P_v,P,AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss, P.Grid()->oSites(),1,{
 | 
			
		||||
      for (int mu = 0; mu < Nd; mu++) {
 | 
			
		||||
        U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
    
 | 
			
		||||
    //auto end = std::chrono::high_resolution_clock::now();
 | 
			
		||||
   //auto end = std::chrono::high_resolution_clock::now();
 | 
			
		||||
   // diff += end - start;
 | 
			
		||||
   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,13 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#define CPS_MD_TIME 
 | 
			
		||||
 | 
			
		||||
#ifdef CPS_MD_TIME
 | 
			
		||||
#define HMC_MOMENTUM_DENOMINATOR (2.0)
 | 
			
		||||
#else
 | 
			
		||||
#define HMC_MOMENTUM_DENOMINATOR (1.0)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
template <class S>
 | 
			
		||||
@@ -20,7 +28,9 @@ public:
 | 
			
		||||
  typedef Field              PropagatorField;
 | 
			
		||||
    
 | 
			
		||||
  static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
 | 
			
		||||
    RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
 | 
			
		||||
    gaussian(pRNG, P);
 | 
			
		||||
    P *= scale; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static inline Field projectForce(Field& P){return P;}
 | 
			
		||||
@@ -66,7 +76,7 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
    
 | 
			
		||||
  static void FreePropagator(const Field &in, Field &out,
 | 
			
		||||
			     const Field &momKernel)
 | 
			
		||||
           const Field &momKernel)
 | 
			
		||||
  {
 | 
			
		||||
    FFT   fft((GridCartesian *)in.Grid());
 | 
			
		||||
    Field inFT(in.Grid());
 | 
			
		||||
@@ -139,14 +149,17 @@ public:
 | 
			
		||||
 | 
			
		||||
    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
 | 
			
		||||
    {
 | 
			
		||||
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
 | 
			
		||||
#ifndef USE_FFT_ACCELERATION
 | 
			
		||||
    Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
 | 
			
		||||
    
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
      Field Pgaussian(P.Grid()), Pp(P.Grid());
 | 
			
		||||
      ComplexField p2(P.Grid()); p2 = zero;
 | 
			
		||||
      RealD M = FFT_MASS;
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
 | 
			
		||||
 | 
			
		||||
      FFT theFFT((GridCartesian*)P.Grid());
 | 
			
		||||
@@ -156,17 +169,17 @@ public:
 | 
			
		||||
      p2 = sqrt(p2);
 | 
			
		||||
      Pp *= p2;
 | 
			
		||||
      theFFT.FFT_all_dim(P, Pp, FFT::backward);
 | 
			
		||||
 | 
			
		||||
#endif //USE_FFT_ACCELERATION
 | 
			
		||||
      P *= scale; 
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static inline Field projectForce(Field& P) {return P;}
 | 
			
		||||
  static inline Field projectForce(Field& P) {return Ta(P);}
 | 
			
		||||
 | 
			
		||||
    static inline void update_field(Field &P, Field &U, double ep)
 | 
			
		||||
    {
 | 
			
		||||
#ifndef USE_FFT_ACCELERATION
 | 
			
		||||
      double t0=usecond(); 
 | 
			
		||||
    U += P*ep;
 | 
			
		||||
      U += P*ep;
 | 
			
		||||
      double t1=usecond();
 | 
			
		||||
      double total_time = (t1-t0)/1e6;
 | 
			
		||||
      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
 | 
			
		||||
 
 | 
			
		||||
@@ -89,8 +89,8 @@ public:
 | 
			
		||||
    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
    auto p_v = p.View();
 | 
			
		||||
    auto action_v = action.View();
 | 
			
		||||
    autoView( p_v      , p, CpuRead);
 | 
			
		||||
    autoView( action_v , action, CpuWrite);
 | 
			
		||||
    for (int mu = 0; mu < Ndim; mu++)
 | 
			
		||||
    {
 | 
			
		||||
      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 | 
			
		||||
@@ -146,8 +146,8 @@ public:
 | 
			
		||||
    for (int point = 0; point < npoint; point++)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      auto p_v = p.View();
 | 
			
		||||
      auto force_v = force.View();
 | 
			
		||||
      autoView( p_v , p, CpuRead);
 | 
			
		||||
      autoView( force_v , force, CpuWrite);
 | 
			
		||||
            
 | 
			
		||||
      int permute_type;
 | 
			
		||||
      StencilEntry *SE;
 | 
			
		||||
 
 | 
			
		||||
@@ -80,10 +80,11 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
 | 
			
		||||
 | 
			
		||||
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,   
 | 
			
		||||
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); 
 | 
			
		||||
//static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
 | 
			
		||||
//                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB"); 
 | 
			
		||||
//static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
 | 
			
		||||
//                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
 | 
			
		||||
 | 
			
		||||
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,   
 | 
			
		||||
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB"); 
 | 
			
		||||
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,   
 | 
			
		||||
                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); 
 | 
			
		||||
 | 
			
		||||
// add the staggered, scalar versions here
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -49,7 +49,7 @@ public:
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
  const unsigned int smearingLevels;
 | 
			
		||||
  Smear_Stout<Gimpl> StoutSmearing;
 | 
			
		||||
  Smear_Stout<Gimpl> *StoutSmearing;
 | 
			
		||||
  std::vector<GaugeField> SmearedSet;
 | 
			
		||||
 | 
			
		||||
  // Member functions
 | 
			
		||||
@@ -72,7 +72,7 @@ private:
 | 
			
		||||
      previous_u = *ThinLinks;
 | 
			
		||||
      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
 | 
			
		||||
      {
 | 
			
		||||
        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
 | 
			
		||||
        StoutSmearing->smear(SmearedSet[smearLvl], previous_u);
 | 
			
		||||
        previous_u = SmearedSet[smearLvl];
 | 
			
		||||
 | 
			
		||||
        // For debug purposes
 | 
			
		||||
@@ -93,7 +93,7 @@ private:
 | 
			
		||||
    GaugeLinkField SigmaKPrime_mu(grid);
 | 
			
		||||
    GaugeLinkField GaugeKmu(grid), Cmu(grid);
 | 
			
		||||
 | 
			
		||||
    StoutSmearing.BaseSmear(C, GaugeK);
 | 
			
		||||
    StoutSmearing->BaseSmear(C, GaugeK);
 | 
			
		||||
    SigmaK = Zero();
 | 
			
		||||
    iLambda = Zero();
 | 
			
		||||
 | 
			
		||||
@@ -107,7 +107,7 @@ private:
 | 
			
		||||
      pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
 | 
			
		||||
      pokeLorentz(iLambda, iLambda_mu, mu);
 | 
			
		||||
    }
 | 
			
		||||
    StoutSmearing.derivative(SigmaK, iLambda,
 | 
			
		||||
    StoutSmearing->derivative(SigmaK, iLambda,
 | 
			
		||||
                             GaugeK);  // derivative of SmearBase
 | 
			
		||||
    return SigmaK;
 | 
			
		||||
  }
 | 
			
		||||
@@ -144,14 +144,14 @@ private:
 | 
			
		||||
    // Exponential
 | 
			
		||||
    iQ2 = iQ * iQ;
 | 
			
		||||
    iQ3 = iQ * iQ2;
 | 
			
		||||
    StoutSmearing.set_uw(u, w, iQ2, iQ3);
 | 
			
		||||
    StoutSmearing.set_fj(f0, f1, f2, u, w);
 | 
			
		||||
    StoutSmearing->set_uw(u, w, iQ2, iQ3);
 | 
			
		||||
    StoutSmearing->set_fj(f0, f1, f2, u, w);
 | 
			
		||||
    e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
 | 
			
		||||
 | 
			
		||||
    // Getting B1, B2, Gamma and Lambda
 | 
			
		||||
    // simplify this part, reduntant calculations in set_fj
 | 
			
		||||
    xi0 = StoutSmearing.func_xi0(w);
 | 
			
		||||
    xi1 = StoutSmearing.func_xi1(w);
 | 
			
		||||
    xi0 = StoutSmearing->func_xi0(w);
 | 
			
		||||
    xi1 = StoutSmearing->func_xi1(w);
 | 
			
		||||
    u2 = u * u;
 | 
			
		||||
    w2 = w * w;
 | 
			
		||||
    cosw = cos(w);
 | 
			
		||||
@@ -219,7 +219,7 @@ public:
 | 
			
		||||
  /* Standard constructor */
 | 
			
		||||
  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
 | 
			
		||||
                       Smear_Stout<Gimpl>& Stout)
 | 
			
		||||
      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
 | 
			
		||||
      : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL)
 | 
			
		||||
  {
 | 
			
		||||
    for (unsigned int i = 0; i < smearingLevels; ++i)
 | 
			
		||||
      SmearedSet.push_back(*(new GaugeField(UGrid)));
 | 
			
		||||
@@ -227,7 +227,7 @@ public:
 | 
			
		||||
 | 
			
		||||
  /*! For just thin links */
 | 
			
		||||
  SmearedConfiguration()
 | 
			
		||||
    : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
 | 
			
		||||
    : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
 | 
			
		||||
 | 
			
		||||
  // attach the smeared routines to the thin links U and fill the smeared set
 | 
			
		||||
  void set_Field(GaugeField &U)
 | 
			
		||||
 
 | 
			
		||||
@@ -185,13 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 | 
			
		||||
 | 
			
		||||
	for(int i=0;i<Lblock;i++){
 | 
			
		||||
 | 
			
		||||
	  auto lhs_v = lhs_wi[i].View();
 | 
			
		||||
	  // Recreate view potentially expensive outside fo UVM mode
 | 
			
		||||
	  autoView(lhs_v,lhs_wi[i],CpuRead);
 | 
			
		||||
	  auto left = conjugate(lhs_v[ss]);
 | 
			
		||||
 | 
			
		||||
	  for(int j=0;j<Rblock;j++){
 | 
			
		||||
 | 
			
		||||
	    SpinMatrix_v vv;
 | 
			
		||||
	    auto rhs_v = rhs_vj[j].View();
 | 
			
		||||
	    // Recreate view potentially expensive outside fo UVM mode
 | 
			
		||||
	    autoView(rhs_v,rhs_vj[j],CpuRead);
 | 
			
		||||
	    auto right = rhs_v[ss];
 | 
			
		||||
	    for(int s1=0;s1<Ns;s1++){
 | 
			
		||||
	    for(int s2=0;s2<Ns;s2++){
 | 
			
		||||
@@ -204,11 +205,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 | 
			
		||||
	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 | 
			
		||||
	    for ( int m=0;m<Nmom;m++){
 | 
			
		||||
	      int idx = m+base;
 | 
			
		||||
	      auto mom_v = mom[m].View();
 | 
			
		||||
	      autoView(mom_v,mom[m],CpuRead);
 | 
			
		||||
	      auto phase = mom_v[ss];
 | 
			
		||||
	      mac(&lvSum[idx],&vv,&phase);
 | 
			
		||||
	    }
 | 
			
		||||
	  
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
@@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 | 
			
		||||
 | 
			
		||||
	for(int i=0;i<Lblock;i++){
 | 
			
		||||
 | 
			
		||||
	  auto wi_v = wi[i].View();
 | 
			
		||||
	  autoView(wi_v,wi[i],CpuRead);
 | 
			
		||||
	  auto w = conjugate(wi_v[ss]);
 | 
			
		||||
	  if (g5) {
 | 
			
		||||
	    w()(2)(0) = - w()(2)(0);
 | 
			
		||||
@@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 | 
			
		||||
	  }
 | 
			
		||||
	  for(int j=0;j<Rblock;j++){
 | 
			
		||||
	    
 | 
			
		||||
	    auto vj_v=vj[j].View();
 | 
			
		||||
	    autoView(vj_v,vj[j],CpuRead);
 | 
			
		||||
	    auto v  = vj_v[ss];
 | 
			
		||||
	    auto vv = v()(0)(0);
 | 
			
		||||
 | 
			
		||||
@@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 | 
			
		||||
 | 
			
		||||
	for(int i=0;i<Lblock;i++){
 | 
			
		||||
 | 
			
		||||
	  auto wi_v = wi[i].View();
 | 
			
		||||
	  autoView(wi_v,wi[i],CpuRead);
 | 
			
		||||
	  auto w = conjugate(wi_v[ss]);
 | 
			
		||||
 | 
			
		||||
	  for(int j=0;j<Rblock;j++){
 | 
			
		||||
	    
 | 
			
		||||
	    auto vj_v = vj[j].View();
 | 
			
		||||
 | 
			
		||||
	    autoView(vj_v,vj[j],CpuRead);
 | 
			
		||||
	    auto v = vj_v[ss];
 | 
			
		||||
 | 
			
		||||
	    auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
 | 
			
		||||
@@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 | 
			
		||||
	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 | 
			
		||||
	    for ( int m=0;m<Nmom;m++){
 | 
			
		||||
	      int idx = m+base;
 | 
			
		||||
	      auto mom_v = mom[m].View();
 | 
			
		||||
	      autoView(mom_v,mom[m],CpuRead);
 | 
			
		||||
	      auto phase = mom_v[ss];
 | 
			
		||||
	      mac(&lvSum[idx],&vv,&phase()()());
 | 
			
		||||
	    }
 | 
			
		||||
@@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 | 
			
		||||
 | 
			
		||||
            for(int i=0;i<Lblock;i++)
 | 
			
		||||
            {
 | 
			
		||||
  	        auto wi_v = lhs_wi[i].View();
 | 
			
		||||
  	        autoView(wi_v,lhs_wi[i],CpuRead);
 | 
			
		||||
                auto left = conjugate(wi_v[ss]);
 | 
			
		||||
 | 
			
		||||
                for(int j=0;j<Rblock;j++)
 | 
			
		||||
                {
 | 
			
		||||
                    SpinMatrix_v vv;
 | 
			
		||||
		    auto vj_v  = rhs_vj[j].View();
 | 
			
		||||
		    autoView(vj_v,rhs_vj[j],CpuRead);
 | 
			
		||||
                    auto right = vj_v[ss];
 | 
			
		||||
 | 
			
		||||
                    for(int s1=0;s1<Ns;s1++)
 | 
			
		||||
@@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 | 
			
		||||
 | 
			
		||||
                    for ( int m=0;m<Nem;m++)
 | 
			
		||||
                    {
 | 
			
		||||
  		        auto emB0_v = emB0[m].View();
 | 
			
		||||
  		        auto emB1_v = emB1[m].View();
 | 
			
		||||
  		        autoView(emB0_v,emB0[m],CpuRead);
 | 
			
		||||
		        autoView(emB1_v,emB1[m],CpuRead);
 | 
			
		||||
                        int idx  = m+base;
 | 
			
		||||
                        auto b0  = emB0_v[ss];
 | 
			
		||||
                        auto b1  = emB1_v[ss];
 | 
			
		||||
@@ -1014,21 +1014,21 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
 | 
			
		||||
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
 | 
			
		||||
      for(int t=0;t<N_t;t++){
 | 
			
		||||
      for(int s=0;s<N_s;s++){
 | 
			
		||||
  auto vs_v = vs[s].View();
 | 
			
		||||
  auto tmp1 = vs_v[ss];
 | 
			
		||||
  vobj tmp2 = Zero();
 | 
			
		||||
  vobj tmp3 = Zero();
 | 
			
		||||
  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
 | 
			
		||||
    auto vd_v = vd[d].View();
 | 
			
		||||
    Scalar_v coeff = WW_sd(t,s,d);
 | 
			
		||||
    tmp3 = conjugate(vd_v[ss]);
 | 
			
		||||
    mac(&tmp2, &coeff, &tmp3);
 | 
			
		||||
  }
 | 
			
		||||
	autoView(vs_v,vs[s],CpuRead);
 | 
			
		||||
	auto tmp1 = vs_v[ss];
 | 
			
		||||
	vobj tmp2 = Zero();
 | 
			
		||||
	vobj tmp3 = Zero();
 | 
			
		||||
	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
 | 
			
		||||
	  autoView(vd_v,vd[d],CpuRead);
 | 
			
		||||
	  Scalar_v coeff = WW_sd(t,s,d);
 | 
			
		||||
	  tmp3 = conjugate(vd_v[ss]);
 | 
			
		||||
	  mac(&tmp2, &coeff, &tmp3);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
  //////////////////////////
 | 
			
		||||
  // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 | 
			
		||||
  //////////////////////////
 | 
			
		||||
  OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
 | 
			
		||||
	//////////////////////////
 | 
			
		||||
	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 | 
			
		||||
	//////////////////////////
 | 
			
		||||
	OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
 | 
			
		||||
 | 
			
		||||
      }}
 | 
			
		||||
    }
 | 
			
		||||
@@ -1067,21 +1067,20 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
 | 
			
		||||
    thread_for(ss,grid->oSites(),{
 | 
			
		||||
      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
 | 
			
		||||
        for(int s=0;s<N_s;s++){
 | 
			
		||||
    auto vs_v = vs[s].View();
 | 
			
		||||
    auto tmp1 = vs_v[ss];
 | 
			
		||||
    vobj tmp2 = Zero();
 | 
			
		||||
    vobj tmp3 = Zero();
 | 
			
		||||
    for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
 | 
			
		||||
      auto vd_v = vd[d].View();
 | 
			
		||||
      Scalar_v coeff = buf(s,d);
 | 
			
		||||
      tmp3 = conjugate(vd_v[ss]);
 | 
			
		||||
      mac(&tmp2, &coeff, &tmp3);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //////////////////////////
 | 
			
		||||
    // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 | 
			
		||||
    //////////////////////////
 | 
			
		||||
    OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
 | 
			
		||||
	  autoView(vs_v,vs[s],CpuRead);
 | 
			
		||||
	  auto tmp1 = vs_v[ss];
 | 
			
		||||
	  vobj tmp2 = Zero();
 | 
			
		||||
	  vobj tmp3 = Zero();
 | 
			
		||||
	  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
 | 
			
		||||
	    autoView(vd_v,vd[d],CpuRead);
 | 
			
		||||
	    Scalar_v coeff = buf(s,d);
 | 
			
		||||
	    tmp3 = conjugate(vd_v[ss]);
 | 
			
		||||
	    mac(&tmp2, &coeff, &tmp3);
 | 
			
		||||
	  }
 | 
			
		||||
	  //////////////////////////
 | 
			
		||||
	  // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 | 
			
		||||
	  //////////////////////////
 | 
			
		||||
	  OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
 | 
			
		||||
      }}
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
@@ -1093,7 +1092,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
 | 
			
		||||
                                             const vobj &rhs,
 | 
			
		||||
                                             const int Ns, const int ss)
 | 
			
		||||
{
 | 
			
		||||
  auto WWVV_v = WWVV.View();
 | 
			
		||||
  autoView(WWVV_v,WWVV,CpuWrite);
 | 
			
		||||
  for (int s1 = 0; s1 < Ns; s1++){
 | 
			
		||||
    for (int s2 = 0; s2 < Ns; s2++){
 | 
			
		||||
      WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
 | 
			
		||||
@@ -1120,33 +1119,39 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
 | 
			
		||||
  assert(gamma0.size()==gamma1.size());
 | 
			
		||||
  int Ng = gamma0.size();
 | 
			
		||||
 | 
			
		||||
  // Make device accessible copy
 | 
			
		||||
  Vector<Gamma> Gamma0v (Ng);
 | 
			
		||||
  Vector<Gamma> Gamma1v (Ng);
 | 
			
		||||
  Gamma *Gamma0 = & Gamma0v[0];
 | 
			
		||||
  Gamma *Gamma1 = & Gamma1v[0];
 | 
			
		||||
  for(int g=0;g<Ng;g++) {
 | 
			
		||||
    Gamma0[g]=gamma0[g];
 | 
			
		||||
    Gamma1[g]=gamma1[g];
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  GridBase *grid = WWVV0.Grid();
 | 
			
		||||
 | 
			
		||||
  auto WWVV0_v = WWVV0.View();
 | 
			
		||||
  auto WWVV1_v = WWVV1.View();
 | 
			
		||||
  auto O_trtr_v= O_trtr.View();
 | 
			
		||||
  auto O_fig8_v= O_fig8.View();
 | 
			
		||||
  thread_for(ss,grid->oSites(),{
 | 
			
		||||
  typedef typename ComplexField::vector_object vobj;
 | 
			
		||||
  autoView(WWVV0_v , WWVV0,AcceleratorRead);
 | 
			
		||||
  autoView(WWVV1_v , WWVV1,AcceleratorRead);
 | 
			
		||||
  autoView(O_trtr_v, O_trtr,AcceleratorWrite);
 | 
			
		||||
  autoView(O_fig8_v, O_fig8,AcceleratorWrite);
 | 
			
		||||
  accelerator_for(ss,grid->oSites(),vobj::Nsimd(),{
 | 
			
		||||
 | 
			
		||||
    typedef typename ComplexField::vector_object vobj;
 | 
			
		||||
 | 
			
		||||
    vobj v_trtr;
 | 
			
		||||
    vobj v_fig8;
 | 
			
		||||
 | 
			
		||||
    auto VV0 = WWVV0_v[ss];
 | 
			
		||||
    auto VV1 = WWVV1_v[ss];
 | 
			
		||||
    auto VV0 = WWVV0_v(ss);
 | 
			
		||||
    auto VV1 = WWVV1_v(ss);
 | 
			
		||||
    
 | 
			
		||||
    for(int g=0;g<Ng;g++){
 | 
			
		||||
 | 
			
		||||
      v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
 | 
			
		||||
      v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
 | 
			
		||||
      auto v_trtr = trace(VV0 * gamma0[g])* trace(VV1*gamma1[g]);
 | 
			
		||||
      auto v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);
 | 
			
		||||
 | 
			
		||||
      if ( g==0 ) {
 | 
			
		||||
	O_trtr_v[ss] = v_trtr; 
 | 
			
		||||
	O_fig8_v[ss] = v_fig8;
 | 
			
		||||
	coalescedWrite(O_trtr_v[ss], v_trtr); 
 | 
			
		||||
	coalescedWrite(O_fig8_v[ss], v_fig8);
 | 
			
		||||
      } else { 
 | 
			
		||||
	O_trtr_v[ss]+= v_trtr; 
 | 
			
		||||
	O_fig8_v[ss]+= v_fig8;
 | 
			
		||||
	coalescedWrite(O_trtr_v[ss], O_trtr_v(ss)+v_trtr); 
 | 
			
		||||
	coalescedWrite(O_fig8_v[ss], O_fig8_v(ss)+v_fig8);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
@@ -1166,25 +1171,36 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = WWVV0.Grid();
 | 
			
		||||
 | 
			
		||||
  auto WWVV0_v = WWVV0.View();
 | 
			
		||||
  auto WWVV1_v = WWVV1.View();
 | 
			
		||||
  auto O_trtr_v= O_trtr.View();
 | 
			
		||||
  auto O_fig8_v= O_fig8.View();
 | 
			
		||||
  // Make device accessible copy
 | 
			
		||||
  Vector<Gamma> Gamma0v (Ng);
 | 
			
		||||
  Vector<Gamma> Gamma1v (Ng);
 | 
			
		||||
  Gamma *Gamma0 = & Gamma0v[0];
 | 
			
		||||
  Gamma *Gamma1 = & Gamma1v[0];
 | 
			
		||||
  for(int g=0;g<Ng;g++) {
 | 
			
		||||
    Gamma0[g]=gamma0[g];
 | 
			
		||||
    Gamma1[g]=gamma1[g];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  thread_for(ss,grid->oSites(),{
 | 
			
		||||
  autoView( WWVV0_v , WWVV0,AcceleratorRead);
 | 
			
		||||
  autoView( WWVV1_v , WWVV1,AcceleratorRead);
 | 
			
		||||
  autoView( O_trtr_v, O_trtr,AcceleratorWrite);
 | 
			
		||||
  autoView( O_fig8_v, O_fig8,AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
    typedef typename ComplexField::vector_object vobj;
 | 
			
		||||
  typedef typename ComplexField::vector_object vobj;
 | 
			
		||||
  accelerator_for(ss,grid->oSites(),vobj::Nsimd(),{
 | 
			
		||||
 | 
			
		||||
    auto VV0 = WWVV0_v(ss);
 | 
			
		||||
    auto VV1 = WWVV1_v(ss);
 | 
			
		||||
 | 
			
		||||
    typedef decltype(trace(VV0)) scalar;
 | 
			
		||||
 | 
			
		||||
    auto VV0 = WWVV0_v[ss];
 | 
			
		||||
    auto VV1 = WWVV1_v[ss];
 | 
			
		||||
    
 | 
			
		||||
    for(int g=0;g<Ng;g++){
 | 
			
		||||
 | 
			
		||||
      auto VV0G = VV0 * gamma0[g];  // Spin multiply
 | 
			
		||||
      auto VV1G = VV1 * gamma1[g];
 | 
			
		||||
 | 
			
		||||
      vobj v_trtr=Zero();
 | 
			
		||||
      vobj v_fig8=Zero();
 | 
			
		||||
      scalar v_trtr=Zero();
 | 
			
		||||
      scalar v_fig8=Zero();
 | 
			
		||||
 | 
			
		||||
      /////////////////////////////////////////
 | 
			
		||||
      // Colour mixed
 | 
			
		||||
@@ -1198,7 +1214,7 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
 | 
			
		||||
      // Wick1 [ spin TR TR ]
 | 
			
		||||
      //
 | 
			
		||||
      //    (VV0*G0)_ss,ba .  (VV1*G1)_tt,ab
 | 
			
		||||
       //
 | 
			
		||||
      //
 | 
			
		||||
      // Wick2 [ spin fig8 ]
 | 
			
		||||
      //
 | 
			
		||||
      //    (VV0*G0)_st,aa (VV1*G1)_ts,bb
 | 
			
		||||
@@ -1235,11 +1251,11 @@ Bag [8,4]  fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17)     //  - 1602
 | 
			
		||||
      }}}}
 | 
			
		||||
 | 
			
		||||
      if ( g==0 ) {
 | 
			
		||||
	O_trtr_v[ss] = v_trtr; 
 | 
			
		||||
	O_fig8_v[ss] = v_fig8;
 | 
			
		||||
	coalescedWrite(O_trtr_v[ss] , v_trtr); 
 | 
			
		||||
	coalescedWrite(O_fig8_v[ss] , v_fig8);
 | 
			
		||||
      } else { 
 | 
			
		||||
	O_trtr_v[ss]+= v_trtr; 
 | 
			
		||||
	O_fig8_v[ss]+= v_fig8;
 | 
			
		||||
	coalescedWrite(O_trtr_v[ss],O_trtr_v(ss) + v_trtr); 
 | 
			
		||||
	coalescedWrite(O_fig8_v[ss],O_fig8_v(ss) + v_fig8);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@
 | 
			
		||||
 Copyright (C) 2019
 | 
			
		||||
 
 | 
			
		||||
 Author: Felix Erben <felix.erben@ed.ac.uk>
 | 
			
		||||
 Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
 This program is free software; you can redistribute it and/or modify
 | 
			
		||||
 it under the terms of the GNU General Public License as published by
 | 
			
		||||
@@ -46,7 +47,7 @@ public:
 | 
			
		||||
  typedef typename SpinMatrixField::vector_object sobj;
 | 
			
		||||
 | 
			
		||||
  static const int epsilon[6][3] ;
 | 
			
		||||
  static const Complex epsilon_sgn[6];
 | 
			
		||||
  static const Real epsilon_sgn[6];
 | 
			
		||||
 | 
			
		||||
  private: 
 | 
			
		||||
  template <class mobj, class robj>
 | 
			
		||||
@@ -58,9 +59,12 @@ public:
 | 
			
		||||
				 const Gamma GammaA_right,
 | 
			
		||||
				 const Gamma GammaB_right,
 | 
			
		||||
				 const int parity,
 | 
			
		||||
				 const int * wick_contractions,
 | 
			
		||||
				 const bool * wick_contractions,
 | 
			
		||||
  				 robj &result);
 | 
			
		||||
  public:
 | 
			
		||||
  static void Wick_Contractions(std::string qi, 
 | 
			
		||||
                 std::string qf, 
 | 
			
		||||
                 bool* wick_contractions);
 | 
			
		||||
  static void ContractBaryons(const PropagatorField &q1_left,
 | 
			
		||||
				 const PropagatorField &q2_left,
 | 
			
		||||
				 const PropagatorField &q3_left,
 | 
			
		||||
@@ -68,8 +72,7 @@ public:
 | 
			
		||||
				 const Gamma GammaB_left,
 | 
			
		||||
				 const Gamma GammaA_right,
 | 
			
		||||
				 const Gamma GammaB_right,
 | 
			
		||||
				 const char * quarks_left,
 | 
			
		||||
				 const char * quarks_right,
 | 
			
		||||
				 const bool* wick_contractions,
 | 
			
		||||
				 const int parity,
 | 
			
		||||
				 ComplexField &baryon_corr);
 | 
			
		||||
  template <class mobj, class robj>
 | 
			
		||||
@@ -80,10 +83,59 @@ public:
 | 
			
		||||
				 const Gamma GammaB_left,
 | 
			
		||||
				 const Gamma GammaA_right,
 | 
			
		||||
				 const Gamma GammaB_right,
 | 
			
		||||
				 const char * quarks_left,
 | 
			
		||||
				 const char * quarks_right,
 | 
			
		||||
				 const bool* wick_contractions,
 | 
			
		||||
				 const int parity,
 | 
			
		||||
				 const int nt,
 | 
			
		||||
				 robj &result);
 | 
			
		||||
  private:
 | 
			
		||||
  template <class mobj, class mobj2, class robj>
 | 
			
		||||
  static void Baryon_Gamma_3pt_Group1_Site(
 | 
			
		||||
           const mobj &Dq1_ti,
 | 
			
		||||
           const mobj2 &Dq2_spec,
 | 
			
		||||
           const mobj2 &Dq3_spec,
 | 
			
		||||
           const mobj &Dq4_tf,
 | 
			
		||||
                   const Gamma GammaJ,
 | 
			
		||||
                   const Gamma GammaBi,
 | 
			
		||||
                   const Gamma GammaBf,
 | 
			
		||||
           int wick_contraction,
 | 
			
		||||
           robj &result);
 | 
			
		||||
 | 
			
		||||
  template <class mobj, class mobj2, class robj>
 | 
			
		||||
  static void Baryon_Gamma_3pt_Group2_Site(
 | 
			
		||||
           const mobj2 &Dq1_spec,
 | 
			
		||||
           const mobj &Dq2_ti,
 | 
			
		||||
           const mobj2 &Dq3_spec,
 | 
			
		||||
           const mobj &Dq4_tf,
 | 
			
		||||
                   const Gamma GammaJ,
 | 
			
		||||
                   const Gamma GammaBi,
 | 
			
		||||
                   const Gamma GammaBf,
 | 
			
		||||
           int wick_contraction,
 | 
			
		||||
           robj &result);
 | 
			
		||||
 | 
			
		||||
  template <class mobj, class mobj2, class robj>
 | 
			
		||||
  static void Baryon_Gamma_3pt_Group3_Site(
 | 
			
		||||
           const mobj2 &Dq1_spec,
 | 
			
		||||
           const mobj2 &Dq2_spec,
 | 
			
		||||
           const mobj &Dq3_ti,
 | 
			
		||||
           const mobj &Dq4_tf,
 | 
			
		||||
                   const Gamma GammaJ,
 | 
			
		||||
                   const Gamma GammaBi,
 | 
			
		||||
                   const Gamma GammaBf,
 | 
			
		||||
           int wick_contraction,
 | 
			
		||||
           robj &result);
 | 
			
		||||
  public:
 | 
			
		||||
  template <class mobj>
 | 
			
		||||
  static void Baryon_Gamma_3pt(
 | 
			
		||||
           const PropagatorField &q_ti,
 | 
			
		||||
           const mobj &Dq_spec1,
 | 
			
		||||
           const mobj &Dq_spec2,
 | 
			
		||||
           const PropagatorField &q_tf,
 | 
			
		||||
           int group,
 | 
			
		||||
           int wick_contraction,
 | 
			
		||||
                   const Gamma GammaJ,
 | 
			
		||||
                   const Gamma GammaBi,
 | 
			
		||||
                   const Gamma GammaBf,
 | 
			
		||||
           SpinMatrixField &stn_corr);
 | 
			
		||||
  private: 
 | 
			
		||||
  template <class mobj, class mobj2, class robj>
 | 
			
		||||
  static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop,
 | 
			
		||||
@@ -151,119 +203,152 @@ public:
 | 
			
		||||
 | 
			
		||||
template <class FImpl> 
 | 
			
		||||
const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
 | 
			
		||||
template <class FImpl> 
 | 
			
		||||
/*template <class FImpl> 
 | 
			
		||||
const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 | 
			
		||||
						    Complex(1),
 | 
			
		||||
						    Complex(1),
 | 
			
		||||
						    Complex(-1),
 | 
			
		||||
						    Complex(-1),
 | 
			
		||||
						    Complex(-1)};
 | 
			
		||||
*/
 | 
			
		||||
template <class FImpl> 
 | 
			
		||||
const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
 | 
			
		||||
 | 
			
		||||
//This is the old version
 | 
			
		||||
template <class FImpl>
 | 
			
		||||
template <class mobj, class robj>
 | 
			
		||||
void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 | 
			
		||||
						 const mobj &D2,
 | 
			
		||||
						 const mobj &D3,
 | 
			
		||||
				                 const Gamma GammaA_left,
 | 
			
		||||
				                 const Gamma GammaB_left,
 | 
			
		||||
				                 const Gamma GammaA_right,
 | 
			
		||||
		                 		 const Gamma GammaB_right,
 | 
			
		||||
						 const int parity,
 | 
			
		||||
						 const int * wick_contraction,
 | 
			
		||||
						 robj &result)
 | 
			
		||||
                const mobj &D2,
 | 
			
		||||
                const mobj &D3,
 | 
			
		||||
                         const Gamma GammaA_i,
 | 
			
		||||
                         const Gamma GammaB_i,
 | 
			
		||||
                         const Gamma GammaA_f,
 | 
			
		||||
                         const Gamma GammaB_f,
 | 
			
		||||
                const int parity,
 | 
			
		||||
                const bool * wick_contraction,
 | 
			
		||||
                robj &result)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
 | 
			
		||||
    Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
 | 
			
		||||
    
 | 
			
		||||
    auto D1_GAi =  D1 * GammaA_i;
 | 
			
		||||
    auto D1_GAi_g4 = D1_GAi * g4;
 | 
			
		||||
    auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4);
 | 
			
		||||
    auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P;
 | 
			
		||||
    auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P;
 | 
			
		||||
 | 
			
		||||
    auto gD1a = GammaA_left * GammaA_right * D1;
 | 
			
		||||
    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
 | 
			
		||||
    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
 | 
			
		||||
    auto gD3 = GammaB_right * D3;
 | 
			
		||||
    auto D2_GBi = D2 * GammaB_i;
 | 
			
		||||
    auto GBf_D2_GBi = GammaB_f * D2_GBi;
 | 
			
		||||
    auto GAf_D2_GBi = GammaA_f * D2_GBi;
 | 
			
		||||
 | 
			
		||||
    auto D2g = D2 * GammaB_left;
 | 
			
		||||
    auto pD1g = pD1 * GammaB_left;
 | 
			
		||||
    auto gD3g = gD3 * GammaB_left;
 | 
			
		||||
    auto GBf_D3 = GammaB_f * D3;
 | 
			
		||||
    auto GAf_D3 = GammaA_f * D3;
 | 
			
		||||
 | 
			
		||||
    for (int ie_left=0; ie_left < 6 ; ie_left++){
 | 
			
		||||
      int a_left = epsilon[ie_left][0]; //a
 | 
			
		||||
      int b_left = epsilon[ie_left][1]; //b
 | 
			
		||||
      int c_left = epsilon[ie_left][2]; //c
 | 
			
		||||
      for (int ie_right=0; ie_right < 6 ; ie_right++){
 | 
			
		||||
        int a_right = epsilon[ie_right][0]; //a'
 | 
			
		||||
        int b_right = epsilon[ie_right][1]; //b'
 | 
			
		||||
        int c_right = epsilon[ie_right][2]; //c'
 | 
			
		||||
	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
 | 
			
		||||
    for (int ie_f=0; ie_f < 6 ; ie_f++){
 | 
			
		||||
        int a_f = epsilon[ie_f][0]; //a
 | 
			
		||||
        int b_f = epsilon[ie_f][1]; //b
 | 
			
		||||
        int c_f = epsilon[ie_f][2]; //c
 | 
			
		||||
    for (int ie_i=0; ie_i < 6 ; ie_i++){
 | 
			
		||||
        int a_i = epsilon[ie_i][0]; //a'
 | 
			
		||||
        int b_i = epsilon[ie_i][1]; //b'
 | 
			
		||||
        int c_i = epsilon[ie_i][2]; //c'
 | 
			
		||||
 | 
			
		||||
        Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
 | 
			
		||||
        //This is the \delta_{456}^{123} part
 | 
			
		||||
	if (wick_contraction[0]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
 | 
			
		||||
	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
 | 
			
		||||
	        result()()() += eepD1*D2g_ab*gD3_ab;
 | 
			
		||||
          }}}
 | 
			
		||||
  	}	  
 | 
			
		||||
        if (wick_contraction[0]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
                auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
 | 
			
		||||
                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() += ee  * GAf_D1_GAi_P_rr_cc
 | 
			
		||||
                                        * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
 | 
			
		||||
                                        * GBf_D3    ()(alpha_f,beta_i)(b_f,b_i);
 | 
			
		||||
                }}
 | 
			
		||||
            }
 | 
			
		||||
        }   
 | 
			
		||||
        //This is the \delta_{456}^{231} part
 | 
			
		||||
	if (wick_contraction[1]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
 | 
			
		||||
	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
 | 
			
		||||
		result()()() += eepD1g_gb*D2_ab*gD3_ag;
 | 
			
		||||
          }}}
 | 
			
		||||
        }	  
 | 
			
		||||
        if (wick_contraction[1]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() += ee  * D1_GAi_P_ar_ac
 | 
			
		||||
                                        * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
 | 
			
		||||
                                        * GAf_D3        ()(rho,beta_i)(c_f,b_i);
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }   
 | 
			
		||||
        //This is the \delta_{456}^{312} part
 | 
			
		||||
	if (wick_contraction[2]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
 | 
			
		||||
	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
 | 
			
		||||
		result()()() += eepD1_gb*D2_ag*gD3g_ab;
 | 
			
		||||
          }}}
 | 
			
		||||
        }	  
 | 
			
		||||
        if (wick_contraction[2]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() += ee  * GBf_D1_GAi_P_ar_bc
 | 
			
		||||
                                        * GAf_D2_GBi    ()(rho,beta_i)(c_f,a_i)
 | 
			
		||||
                                        * D3            ()(alpha_f,beta_i)(a_f,b_i);
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }   
 | 
			
		||||
        //This is the \delta_{456}^{132} part
 | 
			
		||||
	if (wick_contraction[3]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
 | 
			
		||||
	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
 | 
			
		||||
    		result()()() -= eepD1*D2_ab*gD3g_ab;
 | 
			
		||||
          }}}
 | 
			
		||||
        }	  
 | 
			
		||||
        if (wick_contraction[3]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
                auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
 | 
			
		||||
                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() -= ee  * GAf_D1_GAi_P_rr_cc
 | 
			
		||||
                                        * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
 | 
			
		||||
                                        * D3            ()(alpha_f,beta_i)(a_f,b_i);
 | 
			
		||||
              }
 | 
			
		||||
        }}
 | 
			
		||||
        }   
 | 
			
		||||
        //This is the \delta_{456}^{321} part
 | 
			
		||||
	if (wick_contraction[4]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
 | 
			
		||||
	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
 | 
			
		||||
		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
 | 
			
		||||
          }}}
 | 
			
		||||
        }	  
 | 
			
		||||
        if (wick_contraction[4]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() -= ee  * GBf_D1_GAi_P_ar_bc
 | 
			
		||||
                                        * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
 | 
			
		||||
                                        * GAf_D3    ()(rho,beta_i)(c_f,b_i);
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }   
 | 
			
		||||
        //This is the \delta_{456}^{213} part
 | 
			
		||||
	if (wick_contraction[5]){
 | 
			
		||||
	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 | 
			
		||||
	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 | 
			
		||||
	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 | 
			
		||||
	  for (int beta_left=0; beta_left<Ns; beta_left++){
 | 
			
		||||
            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
 | 
			
		||||
	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
 | 
			
		||||
    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
 | 
			
		||||
          }}}
 | 
			
		||||
        }	  
 | 
			
		||||
      }
 | 
			
		||||
        if (wick_contraction[5]){
 | 
			
		||||
            for (int rho=0; rho<Ns; rho++){
 | 
			
		||||
            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
                auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
 | 
			
		||||
                for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
                    result()()() -= ee  * D1_GAi_P_ar_ac
 | 
			
		||||
                                        * GAf_D2_GBi    ()(rho,beta_i)(c_f,a_i)
 | 
			
		||||
                                        * GBf_D3        ()(alpha_f,beta_i)(b_f,b_i);
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Computes which wick contractions should be performed for a    *
 | 
			
		||||
 * baryon 2pt function given the initial and final state quark   *
 | 
			
		||||
 * flavours.                                                     *
 | 
			
		||||
 * The array wick_contractions must be of length 6               */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
void BaryonUtils<FImpl>::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) {
 | 
			
		||||
    const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
 | 
			
		||||
    for (int ie=0; ie < 6 ; ie++) {
 | 
			
		||||
        wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
 | 
			
		||||
                                    && qi[0] == qf[epsilon[ie][0]] 
 | 
			
		||||
                                    && qi[1] == qf[epsilon[ie][1]] 
 | 
			
		||||
                                    && qi[2] == qf[epsilon[ie][2]]);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* The array wick_contractions must be of length 6. The order     * 
 | 
			
		||||
 * corresponds to that shown in the Hadrons documentation         *
 | 
			
		||||
 * at https://aportelli.github.io/Hadrons-doc/#/mcontraction      *
 | 
			
		||||
 * This can be computed from the quark flavours using the         *
 | 
			
		||||
 * Wick_Contractions function above                               */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 | 
			
		||||
						 const PropagatorField &q2_left,
 | 
			
		||||
@@ -272,8 +357,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 | 
			
		||||
				                 const Gamma GammaB_left,
 | 
			
		||||
				                 const Gamma GammaA_right,
 | 
			
		||||
		                 		 const Gamma GammaB_right,
 | 
			
		||||
						 const char * quarks_left,
 | 
			
		||||
						 const char * quarks_right,
 | 
			
		||||
						 const bool* wick_contractions,
 | 
			
		||||
						 const int parity,
 | 
			
		||||
						 ComplexField &baryon_corr)
 | 
			
		||||
{
 | 
			
		||||
@@ -281,38 +365,53 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 | 
			
		||||
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
 | 
			
		||||
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 | 
			
		||||
 | 
			
		||||
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
 | 
			
		||||
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 | 
			
		||||
 
 | 
			
		||||
  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = q1_left.Grid();
 | 
			
		||||
  
 | 
			
		||||
  autoView(vbaryon_corr, baryon_corr,CpuWrite);
 | 
			
		||||
  autoView( v1 , q1_left, CpuRead);
 | 
			
		||||
  autoView( v2 , q2_left, CpuRead);
 | 
			
		||||
  autoView( v3 , q3_left, CpuRead);
 | 
			
		||||
 | 
			
		||||
  int wick_contraction[6];
 | 
			
		||||
  for (int ie=0; ie < 6 ; ie++)
 | 
			
		||||
    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 | 
			
		||||
 | 
			
		||||
  auto vbaryon_corr= baryon_corr.View();
 | 
			
		||||
  auto v1 = q1_left.View();
 | 
			
		||||
  auto v2 = q2_left.View();
 | 
			
		||||
  auto v3 = q3_left.View();
 | 
			
		||||
 | 
			
		||||
 // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
  thread_for(ss,grid->oSites(),{
 | 
			
		||||
  //for(int ss=0; ss < grid->oSites(); ss++){
 | 
			
		||||
  Real bytes =0.;
 | 
			
		||||
  bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
 | 
			
		||||
  for (int ie=0; ie < 6 ; ie++){
 | 
			
		||||
    if(ie==0 or ie==3){
 | 
			
		||||
       bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
 | 
			
		||||
    }
 | 
			
		||||
    else{
 | 
			
		||||
       bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  Real t=0.;
 | 
			
		||||
  t =-usecond();
 | 
			
		||||
 | 
			
		||||
  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
    auto D1 = v1[ss];
 | 
			
		||||
    auto D2 = v2[ss];
 | 
			
		||||
    auto D3 = v3[ss];
 | 
			
		||||
 | 
			
		||||
    vobj result=Zero();
 | 
			
		||||
    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 | 
			
		||||
    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
 | 
			
		||||
    vbaryon_corr[ss] = result; 
 | 
			
		||||
  }  );//end loop over lattice sites
 | 
			
		||||
 | 
			
		||||
  t += usecond();
 | 
			
		||||
 | 
			
		||||
  std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* The array wick_contractions must be of length 6. The order     * 
 | 
			
		||||
 * corresponds to that shown in the Hadrons documentation         *
 | 
			
		||||
 * at https://aportelli.github.io/Hadrons-doc/#/mcontraction      *
 | 
			
		||||
 * This can also be computed from the quark flavours using the    *
 | 
			
		||||
 * Wick_Contractions function above                               */
 | 
			
		||||
template <class FImpl>
 | 
			
		||||
template <class mobj, class robj>
 | 
			
		||||
void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 | 
			
		||||
@@ -322,34 +421,363 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 | 
			
		||||
				                 const Gamma GammaB_left,
 | 
			
		||||
				                 const Gamma GammaA_right,
 | 
			
		||||
		                 		 const Gamma GammaB_right,
 | 
			
		||||
						 const char * quarks_left,
 | 
			
		||||
						 const char * quarks_right,
 | 
			
		||||
						 const bool* wick_contractions,
 | 
			
		||||
						 const int parity,
 | 
			
		||||
						 const int nt,
 | 
			
		||||
						 robj &result)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
 | 
			
		||||
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 | 
			
		||||
 | 
			
		||||
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
 | 
			
		||||
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
 | 
			
		||||
    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
 | 
			
		||||
  std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
 | 
			
		||||
 
 | 
			
		||||
  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 | 
			
		||||
 | 
			
		||||
  int wick_contraction[6];
 | 
			
		||||
  for (int ie=0; ie < 6 ; ie++)
 | 
			
		||||
    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 | 
			
		||||
 | 
			
		||||
     result=Zero();
 | 
			
		||||
     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 | 
			
		||||
  for (int t=0; t<nt; t++) {
 | 
			
		||||
    baryon_site(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/***********************************************************************
 | 
			
		||||
 * End of Baryon 2pt-function code.                                    *
 | 
			
		||||
 *                                                                     *
 | 
			
		||||
 * The following code is for baryonGamma3pt function                   *
 | 
			
		||||
 **********************************************************************/
 | 
			
		||||
 | 
			
		||||
/* Dq1_ti is a quark line from t_i to t_J
 | 
			
		||||
 * Dq2_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq3_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq4_tf is a quark line from t_f to t_J */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
template <class mobj, class mobj2, class robj>
 | 
			
		||||
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group1_Site(
 | 
			
		||||
                        const mobj &Dq1_ti,
 | 
			
		||||
                        const mobj2 &Dq2_spec,
 | 
			
		||||
                        const mobj2 &Dq3_spec,
 | 
			
		||||
                        const mobj &Dq4_tf,
 | 
			
		||||
                                const Gamma GammaJ,
 | 
			
		||||
                                const Gamma GammaBi,
 | 
			
		||||
                                const Gamma GammaBf,
 | 
			
		||||
                        int wick_contraction,
 | 
			
		||||
                        robj &result)
 | 
			
		||||
{
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5); 
 | 
			
		||||
 | 
			
		||||
    auto adjD4_g_D1     = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti;
 | 
			
		||||
    auto Gf_adjD4_g_D1  = GammaBf * adjD4_g_D1;
 | 
			
		||||
    auto D2_Gi          = Dq2_spec * GammaBi;
 | 
			
		||||
    auto Gf_D2_Gi       = GammaBf * D2_Gi;
 | 
			
		||||
    auto Gf_D3          = GammaBf * Dq3_spec;
 | 
			
		||||
 | 
			
		||||
    int a_f, b_f, c_f;
 | 
			
		||||
    int a_i, b_i, c_i;
 | 
			
		||||
 | 
			
		||||
    Real ee;
 | 
			
		||||
 | 
			
		||||
    for (int ie_f=0; ie_f < 6 ; ie_f++){
 | 
			
		||||
        a_f = epsilon[ie_f][0]; //a
 | 
			
		||||
        b_f = epsilon[ie_f][1]; //b
 | 
			
		||||
        c_f = epsilon[ie_f][2]; //c
 | 
			
		||||
    for (int ie_i=0; ie_i < 6 ; ie_i++){
 | 
			
		||||
        a_i = epsilon[ie_i][0]; //a'
 | 
			
		||||
        b_i = epsilon[ie_i][1]; //b'
 | 
			
		||||
        c_i = epsilon[ie_i][2]; //c'
 | 
			
		||||
 | 
			
		||||
        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
 | 
			
		||||
 | 
			
		||||
        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
        for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
            auto D2_Gi_ab_aa        = D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
 | 
			
		||||
            auto Gf_D3_ab_bb        = Gf_D3     ()(alpha_f,beta_i)(b_f,b_i);
 | 
			
		||||
            auto Gf_D2_Gi_ab_ba     = Gf_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
 | 
			
		||||
            auto Dq3_spec_ab_ab     = Dq3_spec  ()(alpha_f,beta_i)(a_f,b_i);
 | 
			
		||||
 | 
			
		||||
            for (int gamma_i=0; gamma_i<Ns; gamma_i++){
 | 
			
		||||
                auto ee_adjD4_g_D1_ag_ac        = ee * adjD4_g_D1   ()(alpha_f,gamma_i)(a_f,c_i);
 | 
			
		||||
                auto ee_Gf_adjD4_g_D1_ag_bc     = ee * Gf_adjD4_g_D1()(alpha_f,gamma_i)(b_f,c_i);
 | 
			
		||||
            for (int gamma_f=0; gamma_f<Ns; gamma_f++){
 | 
			
		||||
                auto ee_adjD4_g_D1_gg_cc        = ee * adjD4_g_D1   ()(gamma_f,gamma_i)(c_f,c_i);
 | 
			
		||||
                auto Dq3_spec_gb_cb             = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
 | 
			
		||||
                auto D2_Gi_gb_ca                = D2_Gi             ()(gamma_f,beta_i)(c_f,a_i);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
                if(wick_contraction == 1) { // Do contraction I1
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_gg_cc
 | 
			
		||||
                                                        * D2_Gi_ab_aa
 | 
			
		||||
                                                        * Gf_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 2) { // Do contraction I2
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_ag_ac
 | 
			
		||||
                                                        * Gf_D2_Gi_ab_ba
 | 
			
		||||
                                                        * Dq3_spec_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 3) { // Do contraction I3
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Gf_adjD4_g_D1_ag_bc
 | 
			
		||||
                                                        * D2_Gi_gb_ca
 | 
			
		||||
                                                        * Dq3_spec_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 4) { // Do contraction I4
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_gg_cc
 | 
			
		||||
                                                        * Gf_D2_Gi_ab_ba
 | 
			
		||||
                                                        * Dq3_spec_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 5) { // Do contraction I5
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Gf_adjD4_g_D1_ag_bc
 | 
			
		||||
                                                        * D2_Gi_ab_aa
 | 
			
		||||
                                                        * Dq3_spec_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 6) { // Do contraction I6
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_ag_ac
 | 
			
		||||
                                                        * D2_Gi_gb_ca
 | 
			
		||||
                                                        * Gf_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }}
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Dq1_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq2_ti is a quark line from t_i to t_J
 | 
			
		||||
 * Dq3_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq4_tf is a quark line from t_f to t_J */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
template <class mobj, class mobj2, class robj>
 | 
			
		||||
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group2_Site(
 | 
			
		||||
                        const mobj2 &Dq1_spec,
 | 
			
		||||
                        const mobj &Dq2_ti,
 | 
			
		||||
                        const mobj2 &Dq3_spec,
 | 
			
		||||
                        const mobj &Dq4_tf,
 | 
			
		||||
                                const Gamma GammaJ,
 | 
			
		||||
                                const Gamma GammaBi,
 | 
			
		||||
                                const Gamma GammaBf,
 | 
			
		||||
                        int wick_contraction,
 | 
			
		||||
                        robj &result)
 | 
			
		||||
{
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5); 
 | 
			
		||||
 | 
			
		||||
    auto adjD4_g_D2_Gi      = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi;
 | 
			
		||||
    auto Gf_adjD4_g_D2_Gi   = GammaBf * adjD4_g_D2_Gi;
 | 
			
		||||
    auto Gf_D1              = GammaBf * Dq1_spec;
 | 
			
		||||
    auto Gf_D3              = GammaBf * Dq3_spec;
 | 
			
		||||
 | 
			
		||||
    int a_f, b_f, c_f;
 | 
			
		||||
    int a_i, b_i, c_i;
 | 
			
		||||
 | 
			
		||||
    Real ee;
 | 
			
		||||
 | 
			
		||||
    for (int ie_f=0; ie_f < 6 ; ie_f++){
 | 
			
		||||
        a_f = epsilon[ie_f][0]; //a
 | 
			
		||||
        b_f = epsilon[ie_f][1]; //b
 | 
			
		||||
        c_f = epsilon[ie_f][2]; //c
 | 
			
		||||
    for (int ie_i=0; ie_i < 6 ; ie_i++){
 | 
			
		||||
        a_i = epsilon[ie_i][0]; //a'
 | 
			
		||||
        b_i = epsilon[ie_i][1]; //b'
 | 
			
		||||
        c_i = epsilon[ie_i][2]; //c'
 | 
			
		||||
 | 
			
		||||
        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
 | 
			
		||||
 | 
			
		||||
        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
        for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
            auto adjD4_g_D2_Gi_ab_aa        = adjD4_g_D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
 | 
			
		||||
            auto Gf_D3_ab_bb                = Gf_D3             ()(alpha_f,beta_i)(b_f,b_i);
 | 
			
		||||
            auto Gf_adjD4_g_D2_Gi_ab_ba     = Gf_adjD4_g_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
 | 
			
		||||
            auto Dq3_spec_ab_ab             = Dq3_spec          ()(alpha_f,beta_i)(a_f,b_i);
 | 
			
		||||
 | 
			
		||||
            for (int gamma_i=0; gamma_i<Ns; gamma_i++){ 
 | 
			
		||||
                auto ee_Dq1_spec_ag_ac      = ee * Dq1_spec     ()(alpha_f,gamma_i)(a_f,c_i);
 | 
			
		||||
                auto ee_Gf_D1_ag_bc         = ee * Gf_D1        ()(alpha_f,gamma_i)(b_f,c_i);
 | 
			
		||||
            for (int gamma_f=0; gamma_f<Ns; gamma_f++){
 | 
			
		||||
                auto ee_Dq1_spec_gg_cc      = ee * Dq1_spec     ()(gamma_f,gamma_i)(c_f,c_i);
 | 
			
		||||
                auto Dq3_spec_gb_cb         = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
 | 
			
		||||
                auto adjD4_g_D2_Gi_gb_ca    = adjD4_g_D2_Gi     ()(gamma_f,beta_i)(c_f,a_i);
 | 
			
		||||
 | 
			
		||||
                if(wick_contraction == 1) { // Do contraction II1
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
 | 
			
		||||
                                                        * adjD4_g_D2_Gi_ab_aa
 | 
			
		||||
                                                        * Gf_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 2) { // Do contraction II2
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
 | 
			
		||||
                                                        * Gf_adjD4_g_D2_Gi_ab_ba
 | 
			
		||||
                                                        * Dq3_spec_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 3) { // Do contraction II3
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
 | 
			
		||||
                                                        * adjD4_g_D2_Gi_gb_ca
 | 
			
		||||
                                                        * Dq3_spec_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 4) { // Do contraction II4
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
 | 
			
		||||
                                                        * Gf_adjD4_g_D2_Gi_ab_ba
 | 
			
		||||
                                                        * Dq3_spec_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 5) { // Do contraction II5
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
 | 
			
		||||
                                                        * adjD4_g_D2_Gi_ab_aa
 | 
			
		||||
                                                        * Dq3_spec_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 6) { // Do contraction II6
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
 | 
			
		||||
                                                        * adjD4_g_D2_Gi_gb_ca
 | 
			
		||||
                                                        * Gf_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }}
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Dq1_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq2_spec is a quark line from t_i to t_f
 | 
			
		||||
 * Dq3_ti is a quark line from t_i to t_J
 | 
			
		||||
 * Dq4_tf is a quark line from t_f to t_J */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
template <class mobj, class mobj2, class robj>
 | 
			
		||||
void BaryonUtils<FImpl>::Baryon_Gamma_3pt_Group3_Site(
 | 
			
		||||
                        const mobj2 &Dq1_spec,
 | 
			
		||||
                        const mobj2 &Dq2_spec,
 | 
			
		||||
                        const mobj &Dq3_ti,
 | 
			
		||||
                        const mobj &Dq4_tf,
 | 
			
		||||
                                const Gamma GammaJ,
 | 
			
		||||
                                const Gamma GammaBi,
 | 
			
		||||
                                const Gamma GammaBf,
 | 
			
		||||
                        int wick_contraction,
 | 
			
		||||
                        robj &result)
 | 
			
		||||
{
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5);
 | 
			
		||||
 | 
			
		||||
    auto adjD4_g_D3     = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti;
 | 
			
		||||
    auto Gf_adjD4_g_D3  = GammaBf * adjD4_g_D3;
 | 
			
		||||
    auto Gf_D1          = GammaBf * Dq1_spec;
 | 
			
		||||
    auto D2_Gi          = Dq2_spec * GammaBi;
 | 
			
		||||
    auto Gf_D2_Gi       = GammaBf * D2_Gi;
 | 
			
		||||
 | 
			
		||||
    int a_f, b_f, c_f;
 | 
			
		||||
    int a_i, b_i, c_i;
 | 
			
		||||
 | 
			
		||||
    Real ee;
 | 
			
		||||
 | 
			
		||||
    for (int ie_f=0; ie_f < 6 ; ie_f++){
 | 
			
		||||
        a_f = epsilon[ie_f][0]; //a
 | 
			
		||||
        b_f = epsilon[ie_f][1]; //b
 | 
			
		||||
        c_f = epsilon[ie_f][2]; //c
 | 
			
		||||
    for (int ie_i=0; ie_i < 6 ; ie_i++){
 | 
			
		||||
        a_i = epsilon[ie_i][0]; //a'
 | 
			
		||||
        b_i = epsilon[ie_i][1]; //b'
 | 
			
		||||
        c_i = epsilon[ie_i][2]; //c'
 | 
			
		||||
 | 
			
		||||
        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
 | 
			
		||||
 | 
			
		||||
        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
 | 
			
		||||
        for (int beta_i=0; beta_i<Ns; beta_i++){
 | 
			
		||||
            auto D2_Gi_ab_aa            = D2_Gi         ()(alpha_f,beta_i)(a_f,a_i);
 | 
			
		||||
            auto Gf_adjD4_g_D3_ab_bb    = Gf_adjD4_g_D3 ()(alpha_f,beta_i)(b_f,b_i);
 | 
			
		||||
            auto Gf_D2_Gi_ab_ba         = Gf_D2_Gi      ()(alpha_f,beta_i)(b_f,a_i);
 | 
			
		||||
            auto adjD4_g_D3_ab_ab       = adjD4_g_D3    ()(alpha_f,beta_i)(a_f,b_i);
 | 
			
		||||
 | 
			
		||||
            for (int gamma_i=0; gamma_i<Ns; gamma_i++) {
 | 
			
		||||
                auto ee_Dq1_spec_ag_ac  = ee * Dq1_spec ()(alpha_f,gamma_i)(a_f,c_i);
 | 
			
		||||
                auto ee_Gf_D1_ag_bc     = ee * Gf_D1    ()(alpha_f,gamma_i)(b_f,c_i);
 | 
			
		||||
            for (int gamma_f=0; gamma_f<Ns; gamma_f++) {
 | 
			
		||||
                auto ee_Dq1_spec_gg_cc  = ee * Dq1_spec ()(gamma_f,gamma_i)(c_f,c_i);
 | 
			
		||||
                auto adjD4_g_D3_gb_cb   = adjD4_g_D3    ()(gamma_f,beta_i)(c_f,b_i);
 | 
			
		||||
                auto D2_Gi_gb_ca        = D2_Gi         ()(gamma_f,beta_i)(c_f,a_i);
 | 
			
		||||
 | 
			
		||||
                if(wick_contraction == 1) { // Do contraction III1
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
 | 
			
		||||
                                                        * D2_Gi_ab_aa
 | 
			
		||||
                                                        * Gf_adjD4_g_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 2) { // Do contraction III2
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
 | 
			
		||||
                                                        * Gf_D2_Gi_ab_ba
 | 
			
		||||
                                                        * adjD4_g_D3_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 3) { // Do contraction III3
 | 
			
		||||
                    result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
 | 
			
		||||
                                                        * D2_Gi_gb_ca
 | 
			
		||||
                                                        * adjD4_g_D3_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 4) { // Do contraction III4
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
 | 
			
		||||
                                                        * Gf_D2_Gi_ab_ba
 | 
			
		||||
                                                        * adjD4_g_D3_ab_ab;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 5) { // Do contraction III5
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
 | 
			
		||||
                                                        * D2_Gi_ab_aa
 | 
			
		||||
                                                        * adjD4_g_D3_gb_cb;
 | 
			
		||||
                }
 | 
			
		||||
                if(wick_contraction == 6) { // Do contraction III6
 | 
			
		||||
                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
 | 
			
		||||
                                                        * D2_Gi_gb_ca
 | 
			
		||||
                                                        * Gf_adjD4_g_D3_ab_bb;
 | 
			
		||||
                }
 | 
			
		||||
            }}
 | 
			
		||||
        }}
 | 
			
		||||
    }}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* The group indicates which inital state quarks the current is  * 
 | 
			
		||||
 * connected to. It must be in the range 1-3.                    *
 | 
			
		||||
 * The wick_contraction must be in the range 1-6 correspond to   *
 | 
			
		||||
 * the contractions given in the Hadrons documentation at        *
 | 
			
		||||
 * https://aportelli.github.io/Hadrons-doc/#/mcontraction        */
 | 
			
		||||
template<class FImpl>
 | 
			
		||||
template <class mobj>
 | 
			
		||||
void BaryonUtils<FImpl>::Baryon_Gamma_3pt(
 | 
			
		||||
                        const PropagatorField &q_ti,
 | 
			
		||||
                        const mobj &Dq_spec1,
 | 
			
		||||
                        const mobj &Dq_spec2,
 | 
			
		||||
                        const PropagatorField &q_tf,
 | 
			
		||||
                        int group,
 | 
			
		||||
                        int wick_contraction,
 | 
			
		||||
                                const Gamma GammaJ,
 | 
			
		||||
                                const Gamma GammaBi,
 | 
			
		||||
                                const Gamma GammaBf,
 | 
			
		||||
                        SpinMatrixField &stn_corr)
 | 
			
		||||
{
 | 
			
		||||
    GridBase *grid = q_tf.Grid();
 | 
			
		||||
 | 
			
		||||
    autoView( vcorr, stn_corr, CpuWrite);
 | 
			
		||||
    autoView( vq_ti , q_ti, CpuRead);
 | 
			
		||||
    autoView( vq_tf , q_tf, CpuRead);
 | 
			
		||||
 | 
			
		||||
    if (group == 1) {
 | 
			
		||||
        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
            auto Dq_ti = vq_ti[ss];
 | 
			
		||||
            auto Dq_tf = vq_tf[ss];
 | 
			
		||||
            sobj result=Zero();
 | 
			
		||||
            Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
 | 
			
		||||
            vcorr[ss] += result; 
 | 
			
		||||
        });//end loop over lattice sites
 | 
			
		||||
    } else if (group == 2) {
 | 
			
		||||
        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
            auto Dq_ti = vq_ti[ss];
 | 
			
		||||
            auto Dq_tf = vq_tf[ss];
 | 
			
		||||
            sobj result=Zero();
 | 
			
		||||
            Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
 | 
			
		||||
            vcorr[ss] += result; 
 | 
			
		||||
        });//end loop over lattice sites
 | 
			
		||||
    } else if (group == 3) {
 | 
			
		||||
        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
            auto Dq_ti = vq_ti[ss];
 | 
			
		||||
            auto Dq_tf = vq_tf[ss];
 | 
			
		||||
            sobj result=Zero();
 | 
			
		||||
            Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
 | 
			
		||||
 | 
			
		||||
            vcorr[ss] += result; 
 | 
			
		||||
        });//end loop over lattice sites
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/***********************************************************************
 | 
			
		||||
 * End of BaryonGamma3pt-function code.                                *
 | 
			
		||||
 *																	   *
 | 
			
		||||
 * The following code is for Sigma -> N rare hyperon decays            *
 | 
			
		||||
 **********************************************************************/
 | 
			
		||||
 | 
			
		||||
@@ -590,13 +1018,12 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = qs_ti.Grid();
 | 
			
		||||
 | 
			
		||||
  auto vcorr= stn_corr.View();
 | 
			
		||||
  auto vq_loop = qq_loop.View();
 | 
			
		||||
  auto vd_tf = qd_tf.View();
 | 
			
		||||
  auto vs_ti = qs_ti.View();
 | 
			
		||||
  autoView( vcorr, stn_corr, CpuWrite);
 | 
			
		||||
  autoView( vq_loop , qq_loop, CpuRead);
 | 
			
		||||
  autoView( vd_tf , qd_tf, CpuRead);
 | 
			
		||||
  autoView( vs_ti , qs_ti, CpuRead);
 | 
			
		||||
 | 
			
		||||
 // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
  thread_for(ss,grid->oSites(),{
 | 
			
		||||
  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
    auto Dq_loop = vq_loop[ss];
 | 
			
		||||
    auto Dd_tf = vd_tf[ss];
 | 
			
		||||
    auto Ds_ti = vs_ti[ss];
 | 
			
		||||
@@ -631,12 +1058,11 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 | 
			
		||||
 | 
			
		||||
  GridBase *grid = qs_ti.Grid();
 | 
			
		||||
 | 
			
		||||
  auto vcorr= stn_corr.View();
 | 
			
		||||
  auto vq_ti = qq_ti.View();
 | 
			
		||||
  auto vq_tf = qq_tf.View();
 | 
			
		||||
  auto vd_tf = qd_tf.View();
 | 
			
		||||
  auto vs_ti = qs_ti.View();
 | 
			
		||||
 | 
			
		||||
  autoView( vcorr , stn_corr, CpuWrite);
 | 
			
		||||
  autoView( vq_ti , qq_ti, CpuRead);
 | 
			
		||||
  autoView( vq_tf , qq_tf, CpuRead);
 | 
			
		||||
  autoView( vd_tf , qd_tf, CpuRead);
 | 
			
		||||
  autoView( vs_ti , qs_ti, CpuRead);
 | 
			
		||||
 // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
 | 
			
		||||
  thread_for(ss,grid->oSites(),{
 | 
			
		||||
    auto Dq_ti = vq_ti[ss];
 | 
			
		||||
 
 | 
			
		||||
@@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
 | 
			
		||||
  Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView(x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView(z_v, z, AcceleratorWrite);
 | 
			
		||||
  accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
 | 
			
		||||
    auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
 | 
			
		||||
    coalescedWrite(z_v[ss],tmp);
 | 
			
		||||
@@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
 | 
			
		||||
  conformable(x,z);
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  // FIXME -- need a new class of accelerator_loop to implement this
 | 
			
		||||
  //
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
@@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
  Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
    uint64_t ss = sss*Ls;
 | 
			
		||||
@@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 | 
			
		||||
  conformable(x,z);
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
@@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
@@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
    uint64_t ss = sss*Ls;
 | 
			
		||||
@@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
 | 
			
		||||
  conformable(x,z);
 | 
			
		||||
  GridBase *grid=x.Grid();
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto y_v = y.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( y_v, y, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
    uint64_t ss = sss*Ls;
 | 
			
		||||
@@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
 | 
			
		||||
  conformable(x,z);
 | 
			
		||||
  int Ls = grid->_rdimensions[0];
 | 
			
		||||
  Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  uint64_t nloop = grid->oSites()/Ls;
 | 
			
		||||
  accelerator_for(sss,nloop,vobj::Nsimd(),{
 | 
			
		||||
    uint64_t ss = sss*Ls;
 | 
			
		||||
@@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
 | 
			
		||||
  static_assert(nbasis % 2 == 0, "");
 | 
			
		||||
  int nb = nbasis / 2;
 | 
			
		||||
 | 
			
		||||
  auto z_v = z.View();
 | 
			
		||||
  auto x_v = x.View();
 | 
			
		||||
  autoView( z_v, z, AcceleratorWrite);
 | 
			
		||||
  autoView( x_v, x, AcceleratorRead);
 | 
			
		||||
  accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
 | 
			
		||||
  {
 | 
			
		||||
    for(int n = 0; n < nb; ++n) {
 | 
			
		||||
 
 | 
			
		||||
@@ -222,11 +222,11 @@ public:
 | 
			
		||||
    conformable(subgroup, Determinant);
 | 
			
		||||
    int i0, i1;
 | 
			
		||||
    su2SubGroupIndex(i0, i1, su2_index);
 | 
			
		||||
    auto subgroup_v = subgroup.View();
 | 
			
		||||
    auto source_v   = source.View();
 | 
			
		||||
    auto Determinant_v = Determinant.View();
 | 
			
		||||
 | 
			
		||||
    thread_for(ss, grid->oSites(), {
 | 
			
		||||
    autoView( subgroup_v , subgroup,AcceleratorWrite);
 | 
			
		||||
    autoView( source_v   , source,AcceleratorRead);
 | 
			
		||||
    autoView( Determinant_v , Determinant,AcceleratorWrite);
 | 
			
		||||
    accelerator_for(ss, grid->oSites(), 1, {
 | 
			
		||||
 | 
			
		||||
      subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
 | 
			
		||||
      subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
 | 
			
		||||
@@ -257,15 +257,16 @@ public:
 | 
			
		||||
    su2SubGroupIndex(i0, i1, su2_index);
 | 
			
		||||
 | 
			
		||||
    dest = 1.0;  // start out with identity
 | 
			
		||||
    auto dest_v = dest.View();
 | 
			
		||||
    auto subgroup_v = subgroup.View();
 | 
			
		||||
    thread_for(ss, grid->oSites(),
 | 
			
		||||
    autoView( dest_v , dest, AcceleratorWrite);
 | 
			
		||||
    autoView( subgroup_v, subgroup, AcceleratorRead);
 | 
			
		||||
    accelerator_for(ss, grid->oSites(),1,
 | 
			
		||||
    {
 | 
			
		||||
      dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
 | 
			
		||||
      dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
 | 
			
		||||
      dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
 | 
			
		||||
      dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////
 | 
			
		||||
@@ -608,8 +609,8 @@ public:
 | 
			
		||||
 | 
			
		||||
  // reunitarise??
 | 
			
		||||
  template <typename LatticeMatrixType>
 | 
			
		||||
  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
 | 
			
		||||
                           double scale = 1.0) {
 | 
			
		||||
  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0) 
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *grid = out.Grid();
 | 
			
		||||
 | 
			
		||||
    typedef typename LatticeMatrixType::vector_type vector_type;
 | 
			
		||||
@@ -618,8 +619,7 @@ public:
 | 
			
		||||
    typedef iSinglet<vector_type> vTComplexType;
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<vTComplexType> LatticeComplexType;
 | 
			
		||||
    typedef typename GridTypeMapper<
 | 
			
		||||
      typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
 | 
			
		||||
    typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
 | 
			
		||||
 | 
			
		||||
    LatticeComplexType ca(grid);
 | 
			
		||||
    LatticeMatrixType lie(grid);
 | 
			
		||||
@@ -629,6 +629,7 @@ public:
 | 
			
		||||
    MatrixType ta;
 | 
			
		||||
 | 
			
		||||
    lie = Zero();
 | 
			
		||||
 | 
			
		||||
    for (int a = 0; a < AdjointDimension; a++) {
 | 
			
		||||
      random(pRNG, ca);
 | 
			
		||||
 | 
			
		||||
@@ -640,6 +641,7 @@ public:
 | 
			
		||||
      la = ci * ca * ta;
 | 
			
		||||
 | 
			
		||||
      lie = lie + la;  // e^{i la ta}
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    taExp(lie, out);
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include "BinaryIO.h"
 | 
			
		||||
#include "TextIO.h"
 | 
			
		||||
#include "XmlIO.h"
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
 | 
			
		||||
#include "JSON_IO.h"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
*/
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#include <cuda_fp16.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
#include <hip/hip_fp16.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
@@ -142,7 +147,7 @@ typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;
 | 
			
		||||
accelerator_inline float half2float(half h)
 | 
			
		||||
{
 | 
			
		||||
  float f;
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
  f = __half2float(h);
 | 
			
		||||
#else 
 | 
			
		||||
  //f = __half2float(h);
 | 
			
		||||
@@ -156,7 +161,7 @@ accelerator_inline float half2float(half h)
 | 
			
		||||
accelerator_inline half float2half(float f)
 | 
			
		||||
{
 | 
			
		||||
  half h;
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
  h = __float2half(f);
 | 
			
		||||
#else
 | 
			
		||||
  Grid_half hh = sfw_float_to_half(f);
 | 
			
		||||
 
 | 
			
		||||
@@ -31,7 +31,7 @@ directory
 | 
			
		||||
#ifndef GRID_SIMD_H
 | 
			
		||||
#define GRID_SIMD_H
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
#include <thrust/complex.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
@@ -65,7 +65,7 @@ typedef RealD   Real;
 | 
			
		||||
typedef RealF  Real;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
typedef thrust::complex<RealF> ComplexF;
 | 
			
		||||
typedef thrust::complex<RealD> ComplexD;
 | 
			
		||||
typedef thrust::complex<Real>  Complex;
 | 
			
		||||
 
 | 
			
		||||
@@ -67,7 +67,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
 | 
			
		||||
{
 | 
			
		||||
  int num=table.size();
 | 
			
		||||
  std::pair<int,int> *table_v = & table[0];
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View(AcceleratorRead);
 | 
			
		||||
  accelerator_forNB( i,num, vobj::Nsimd(), {
 | 
			
		||||
    typedef decltype(coalescedRead(buffer[0])) compressed_t;
 | 
			
		||||
    compressed_t   tmp_c;
 | 
			
		||||
@@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
 | 
			
		||||
    compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
 | 
			
		||||
    coalescedWrite(buffer[off+o],tmp_c);
 | 
			
		||||
  });
 | 
			
		||||
  rhs_v.ViewClose();
 | 
			
		||||
// Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -94,7 +96,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
 | 
			
		||||
  int num=table.size()/2;
 | 
			
		||||
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View();
 | 
			
		||||
  auto rhs_v = rhs.View(AcceleratorRead);
 | 
			
		||||
  auto p0=&pointers[0][0];
 | 
			
		||||
  auto p1=&pointers[1][0];
 | 
			
		||||
  auto tp=&table[0];
 | 
			
		||||
@@ -104,10 +106,11 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
 | 
			
		||||
			      so+tp[2*j+1].second,
 | 
			
		||||
			      type);
 | 
			
		||||
  });
 | 
			
		||||
  rhs_v.ViewClose();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct StencilEntry { 
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
  uint64_t _byte_offset;       // 8 bytes 
 | 
			
		||||
  uint32_t _offset;            // 4 bytes 
 | 
			
		||||
#else
 | 
			
		||||
@@ -122,7 +125,7 @@ struct StencilEntry {
 | 
			
		||||
// Could pack to 8 + 4 + 4 = 128 bit and use 
 | 
			
		||||
 | 
			
		||||
template<class vobj,class cobj,class Parameters>
 | 
			
		||||
class CartesianStencilView {
 | 
			
		||||
class CartesianStencilAccelerator {
 | 
			
		||||
 public:
 | 
			
		||||
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
 | 
			
		||||
 | 
			
		||||
@@ -130,14 +133,15 @@ class CartesianStencilView {
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  // Basic Grid and stencil info
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  int                               _checkerboard;
 | 
			
		||||
  int                               _npoints; // Move to template param?
 | 
			
		||||
  int           _checkerboard;
 | 
			
		||||
  int           _npoints; // Move to template param?
 | 
			
		||||
  int           _osites;
 | 
			
		||||
  StencilVector _directions;
 | 
			
		||||
  StencilVector _distances;
 | 
			
		||||
  StencilVector _comm_buf_size;
 | 
			
		||||
  StencilVector _permute_type;
 | 
			
		||||
  StencilVector same_node;
 | 
			
		||||
  Coordinate                         _simd_layout;
 | 
			
		||||
  Coordinate    _simd_layout;
 | 
			
		||||
  Parameters    parameters;
 | 
			
		||||
  StencilEntry*  _entries_p;
 | 
			
		||||
  cobj* u_recv_buf_p;
 | 
			
		||||
@@ -175,13 +179,43 @@ class CartesianStencilView {
 | 
			
		||||
  {
 | 
			
		||||
    Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class vobj,class cobj,class Parameters>
 | 
			
		||||
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters> 
 | 
			
		||||
{
 | 
			
		||||
 private:
 | 
			
		||||
  int *closed;
 | 
			
		||||
  StencilEntry *cpu_ptr;
 | 
			
		||||
  ViewMode      mode;
 | 
			
		||||
 public:
 | 
			
		||||
  // default copy constructor
 | 
			
		||||
  CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
 | 
			
		||||
 | 
			
		||||
  CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode) 
 | 
			
		||||
    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
 | 
			
		||||
    cpu_ptr(this->_entries_p),
 | 
			
		||||
    mode(_mode)
 | 
			
		||||
  {
 | 
			
		||||
    this->_entries_p =(StencilEntry *)
 | 
			
		||||
      MemoryManager::ViewOpen(this->_entries_p,
 | 
			
		||||
			      this->_npoints*this->_osites*sizeof(StencilEntry),
 | 
			
		||||
			      mode,
 | 
			
		||||
			      AdviseDefault);    
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  void ViewClose(void)
 | 
			
		||||
  {
 | 
			
		||||
    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////
 | 
			
		||||
// The Stencil Class itself
 | 
			
		||||
////////////////////////////////////////
 | 
			
		||||
template<class vobj,class cobj,class Parameters>
 | 
			
		||||
class CartesianStencil : public CartesianStencilView<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
			
		||||
class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  typedef typename cobj::vector_type vector_type;
 | 
			
		||||
@@ -226,8 +260,8 @@ public:
 | 
			
		||||
  // Generalise as required later if needed
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  View_type View(void) const {
 | 
			
		||||
    View_type accessor(*( (View_type *) this));
 | 
			
		||||
  View_type View(ViewMode mode) const {
 | 
			
		||||
    View_type accessor(*( (View_type *) this),mode);
 | 
			
		||||
    return accessor;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
@@ -662,9 +696,9 @@ public:
 | 
			
		||||
    _unified_buffer_size=0;
 | 
			
		||||
    surface_list.resize(0);
 | 
			
		||||
 | 
			
		||||
    int osites  = _grid->oSites();
 | 
			
		||||
    this->_osites  = _grid->oSites();
 | 
			
		||||
    
 | 
			
		||||
    _entries.resize(this->_npoints* osites);
 | 
			
		||||
    _entries.resize(this->_npoints* this->_osites);
 | 
			
		||||
    this->_entries_p = &_entries[0];
 | 
			
		||||
    for(int ii=0;ii<npoints;ii++){
 | 
			
		||||
      
 | 
			
		||||
 
 | 
			
		||||
@@ -31,22 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
//accelerator_inline void SIMTsynchronise(void) 
 | 
			
		||||
// Synchronisation point for accelerator code: calls __syncwarp() when
// compiled for a CUDA device (__CUDA_ARCH__ defined), otherwise does nothing.
accelerator_inline void synchronise(void)
{
#if defined(__CUDA_ARCH__)
  __syncwarp();
#endif
}
 | 
			
		||||
 | 
			
		||||
#ifndef __CUDA_ARCH__
 | 
			
		||||
#ifndef GRID_SIMT
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
// Trivial mapping of vectors on host
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
 | 
			
		||||
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
 | 
			
		||||
{
 | 
			
		||||
@@ -66,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
 | 
			
		||||
{
 | 
			
		||||
  //  vstream(vec, extracted);
 | 
			
		||||
  vec = extracted;
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
@@ -75,25 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 | 
			
		||||
  vstream(vec, extracted);
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
// Extract and insert slices on the GPU
 | 
			
		||||
//////////////////////////////////////////
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd()))
 | 
			
		||||
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 | 
			
		||||
{
 | 
			
		||||
  return extractLane(lane,vec);
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd()))
 | 
			
		||||
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 | 
			
		||||
{
 | 
			
		||||
  int mask = vobj::Nsimd() >> (ptype + 1);		
 | 
			
		||||
  int plane= doperm ? lane ^ mask : lane;
 | 
			
		||||
  return extractLane(plane,vec);
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> accelerator_inline
 | 
			
		||||
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd()))
 | 
			
		||||
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
 | 
			
		||||
{
 | 
			
		||||
  insertLane(lane,vec,extracted);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -59,6 +59,20 @@ class GridTensorBase {};
 | 
			
		||||
  using DoublePrecision2= typename Traits::DoublePrecision2; \
 | 
			
		||||
  static constexpr int TensorLevel = Traits::TensorLevel
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////
 | 
			
		||||
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
 | 
			
		||||
///////////////////////////////////////////////////////////
 | 
			
		||||
template <class T>
 | 
			
		||||
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
 | 
			
		||||
TensorRemove(T arg) {
 | 
			
		||||
  return arg;
 | 
			
		||||
}
 | 
			
		||||
template <class vtype>
 | 
			
		||||
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
 | 
			
		||||
  -> decltype(TensorRemove(arg._internal)) {
 | 
			
		||||
  return TensorRemove(arg._internal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vtype>
 | 
			
		||||
class iScalar {
 | 
			
		||||
public:
 | 
			
		||||
@@ -135,9 +149,10 @@ public:
 | 
			
		||||
  operator ComplexD() const {
 | 
			
		||||
    return (TensorRemove(_internal));
 | 
			
		||||
  }
 | 
			
		||||
  //             instantiation of "Grid::iScalar<vtype>::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, <unnamed>=0, <unnamed>=0U]" 
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline
 | 
			
		||||
  operator RealD() const {
 | 
			
		||||
    return TensorRemove(_internal);
 | 
			
		||||
    return (RealD) TensorRemove(_internal);
 | 
			
		||||
  }
 | 
			
		||||
  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline
 | 
			
		||||
  operator Integer() const {
 | 
			
		||||
@@ -169,20 +184,6 @@ public:
 | 
			
		||||
  strong_inline       scalar_type * end()         { return begin() + Traits::count; }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////
 | 
			
		||||
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
 | 
			
		||||
///////////////////////////////////////////////////////////
 | 
			
		||||
template <class T>
 | 
			
		||||
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
 | 
			
		||||
TensorRemove(T arg) {
 | 
			
		||||
  return arg;
 | 
			
		||||
}
 | 
			
		||||
template <class vtype>
 | 
			
		||||
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
 | 
			
		||||
  -> decltype(TensorRemove(arg._internal)) {
 | 
			
		||||
  return TensorRemove(arg._internal);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vtype, int N>
 | 
			
		||||
class iVector {
 | 
			
		||||
public:
 | 
			
		||||
 
 | 
			
		||||
@@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Specialisation: Cayley-Hamilton exponential for SU(3)
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
 | 
			
		||||
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										207
									
								
								Grid/threads/Accelerator.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										207
									
								
								Grid/threads/Accelerator.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,207 @@
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
// Runtime-tunable thread count used when sizing accelerator thread blocks
// (the block holds acceleratorThreads()*Nsimd threads); defaults to 2.
uint32_t accelerator_threads = 2;

// Query the current accelerator thread count.
uint32_t acceleratorThreads()
{
  return accelerator_threads;
}

// Override the accelerator thread count used by subsequent launches.
void acceleratorThreads(uint32_t t)
{
  accelerator_threads = t;
}
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
cudaDeviceProp *gpu_props;
 | 
			
		||||
void acceleratorInit(void)
 | 
			
		||||
{
 | 
			
		||||
  int nDevices = 1;
 | 
			
		||||
  cudaGetDeviceCount(&nDevices);
 | 
			
		||||
  gpu_props = new cudaDeviceProp[nDevices];
 | 
			
		||||
 | 
			
		||||
  char * localRankStr = NULL;
 | 
			
		||||
  int rank = 0, world_rank=0; 
 | 
			
		||||
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 | 
			
		||||
#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
 | 
			
		||||
  // We extract the local rank initialization using an environment variable
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
 | 
			
		||||
  size_t totalDeviceMem=0;
 | 
			
		||||
  for (int i = 0; i < nDevices; i++) {
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorCudaInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 | 
			
		||||
#define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
 | 
			
		||||
    cudaGetDeviceProperties(&gpu_props[i], i);
 | 
			
		||||
    cudaDeviceProp prop; 
 | 
			
		||||
    prop = gpu_props[i];
 | 
			
		||||
    totalDeviceMem = prop.totalGlobalMem;
 | 
			
		||||
    if ( world_rank == 0) {
 | 
			
		||||
      printf("AcceleratorCudaInit: ========================\n");
 | 
			
		||||
      printf("AcceleratorCudaInit: Device Number    : %d\n", i);
 | 
			
		||||
      printf("AcceleratorCudaInit: ========================\n");
 | 
			
		||||
      printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);
 | 
			
		||||
 | 
			
		||||
      GPU_PROP_FMT(totalGlobalMem,"%lld");
 | 
			
		||||
      GPU_PROP(managedMemory);
 | 
			
		||||
      GPU_PROP(isMultiGpuBoard);
 | 
			
		||||
      GPU_PROP(warpSize);
 | 
			
		||||
      //      GPU_PROP(unifiedAddressing);
 | 
			
		||||
      //      GPU_PROP(l2CacheSize);
 | 
			
		||||
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 | 
			
		||||
#undef GPU_PROP_FMT    
 | 
			
		||||
#undef GPU_PROP
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_IBM_SUMMIT
 | 
			
		||||
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
 | 
			
		||||
#else
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: setting device to node rank\n");
 | 
			
		||||
  cudaSetDevice(rank);
 | 
			
		||||
#endif
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
hipDeviceProp_t *gpu_props;
 | 
			
		||||
void acceleratorInit(void)
 | 
			
		||||
{
 | 
			
		||||
  int nDevices = 1;
 | 
			
		||||
  hipGetDeviceCount(&nDevices);
 | 
			
		||||
  gpu_props = new hipDeviceProp_t[nDevices];
 | 
			
		||||
 | 
			
		||||
  char * localRankStr = NULL;
 | 
			
		||||
  int rank = 0, world_rank=0; 
 | 
			
		||||
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 | 
			
		||||
#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
 | 
			
		||||
  // We extract the local rank initialization using an environment variable
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < nDevices; i++) {
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 | 
			
		||||
#define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
 | 
			
		||||
    
 | 
			
		||||
    hipGetDeviceProperties(&gpu_props[i], i);
 | 
			
		||||
    if ( world_rank == 0) {
 | 
			
		||||
      hipDeviceProp_t prop; 
 | 
			
		||||
      prop = gpu_props[i];
 | 
			
		||||
      printf("AcceleratorHipInit: ========================\n");
 | 
			
		||||
      printf("AcceleratorHipInit: Device Number    : %d\n", i);
 | 
			
		||||
      printf("AcceleratorHipInit: ========================\n");
 | 
			
		||||
      printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
 | 
			
		||||
 | 
			
		||||
      //      GPU_PROP(managedMemory);
 | 
			
		||||
      GPU_PROP(isMultiGpuBoard);
 | 
			
		||||
      GPU_PROP(warpSize);
 | 
			
		||||
      //      GPU_PROP(unifiedAddressing);
 | 
			
		||||
      //      GPU_PROP(l2CacheSize);
 | 
			
		||||
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
#undef GPU_PROP_FMT    
 | 
			
		||||
#undef GPU_PROP
 | 
			
		||||
#ifdef GRID_IBM_SUMMIT
 | 
			
		||||
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
 | 
			
		||||
#else
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorHipInit: setting device to node rank\n");
 | 
			
		||||
  hipSetDevice(rank);
 | 
			
		||||
#endif
 | 
			
		||||
  if ( world_rank == 0 )  printf("AcceleratorHipInit: ================================================\n");
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
 | 
			
		||||
cl::sycl::queue *theGridAccelerator;
 | 
			
		||||
 | 
			
		||||
void acceleratorInit(void)
 | 
			
		||||
{
 | 
			
		||||
  int nDevices = 1;
 | 
			
		||||
  cl::sycl::gpu_selector selector;
 | 
			
		||||
  cl::sycl::device selectedDevice { selector };
 | 
			
		||||
  theGridAccelerator = new sycl::queue (selectedDevice);
 | 
			
		||||
 | 
			
		||||
  char * localRankStr = NULL;
 | 
			
		||||
  int rank = 0, world_rank=0; 
 | 
			
		||||
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 | 
			
		||||
#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
 | 
			
		||||
  // We extract the local rank initialization using an environment variable
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
 | 
			
		||||
  auto devices = cl::sycl::device::get_devices();
 | 
			
		||||
  for(int d = 0;d<devices.size();d++){
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_STR(prop) \
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_FMT(prop,FMT) \
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
 | 
			
		||||
 | 
			
		||||
    GPU_PROP_STR(vendor);
 | 
			
		||||
    GPU_PROP_STR(version);
 | 
			
		||||
    //    GPU_PROP_STR(device_type);
 | 
			
		||||
    /*
 | 
			
		||||
    GPU_PROP(max_compute_units);
 | 
			
		||||
    GPU_PROP(native_vector_width_char);
 | 
			
		||||
    GPU_PROP(native_vector_width_short);
 | 
			
		||||
    GPU_PROP(native_vector_width_int);
 | 
			
		||||
    GPU_PROP(native_vector_width_long);
 | 
			
		||||
    GPU_PROP(native_vector_width_float);
 | 
			
		||||
    GPU_PROP(native_vector_width_double);
 | 
			
		||||
    GPU_PROP(native_vector_width_half);
 | 
			
		||||
    GPU_PROP(address_bits);
 | 
			
		||||
    GPU_PROP(half_fp_config);
 | 
			
		||||
    GPU_PROP(single_fp_config);
 | 
			
		||||
    */
 | 
			
		||||
    //    GPU_PROP(double_fp_config);
 | 
			
		||||
    GPU_PROP(global_mem_size);
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
  if ( world_rank == 0 ) {
 | 
			
		||||
    auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
 | 
			
		||||
    printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str());
 | 
			
		||||
    printf("AcceleratorSyclInit: ================================================\n");
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
 | 
			
		||||
// CPU-only build: there is no accelerator to initialise.
void acceleratorInit()
{
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
							
								
								
									
										426
									
								
								Grid/threads/Accelerator.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										426
									
								
								Grid/threads/Accelerator.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,426 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./Grid/threads/Accelerator.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#include <string.h>
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MALLOC_MALLOC_H
 | 
			
		||||
#include <malloc/malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef HAVE_MALLOC_H
 | 
			
		||||
#include <malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
#include <mm_malloc.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Accelerator primitives; fall back to threading if not CUDA, SYCL or HIP
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//
 | 
			
		||||
// Function attributes
 | 
			
		||||
//
 | 
			
		||||
//    accelerator
 | 
			
		||||
//    accelerator_inline
 | 
			
		||||
//
 | 
			
		||||
// Parallel looping
 | 
			
		||||
// 
 | 
			
		||||
//    accelerator_for
 | 
			
		||||
//    accelerator_forNB 
 | 
			
		||||
//    accelerator_barrier(dummy);             // macro: device synchronise
 | 
			
		||||
//
 | 
			
		||||
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
 | 
			
		||||
//
 | 
			
		||||
//    uint32_t acceleratorThreads(void);   
 | 
			
		||||
//    void     acceleratorThreads(uint32_t);
 | 
			
		||||
//
 | 
			
		||||
// Warp control and info:
 | 
			
		||||
//
 | 
			
		||||
//    acceleratorInit;
 | 
			
		||||
//    void     acceleratorSynchronise(void); // synch warp etc..
 | 
			
		||||
//    int      acceleratorSIMTlane(int Nsimd);
 | 
			
		||||
//
 | 
			
		||||
// Memory management:
 | 
			
		||||
//
 | 
			
		||||
//    void *acceleratorAllocShared(size_t bytes);
 | 
			
		||||
//    void acceleratorFreeShared(void *ptr);
 | 
			
		||||
//
 | 
			
		||||
//    void *acceleratorAllocDevice(size_t bytes);
 | 
			
		||||
//    void acceleratorFreeDevice(void *ptr);
 | 
			
		||||
//
 | 
			
		||||
//    void acceleratorCopyToDevice(void *from,void *to,size_t bytes);
 | 
			
		||||
//    void acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
 | 
			
		||||
//
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
// Accelerator thread count: query and override.
uint32_t acceleratorThreads();
void     acceleratorThreads(uint32_t);

// One-time accelerator start-up; implemented per target in Accelerator.cc.
void     acceleratorInit();
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// CUDA acceleration
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
 | 
			
		||||
#ifdef __CUDA_ARCH__
 | 
			
		||||
#define GRID_SIMT
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define accelerator        __host__ __device__
 | 
			
		||||
#define accelerator_inline __host__ __device__ inline
 | 
			
		||||
 | 
			
		||||
// CUDA: SIMD lane owned by the calling thread. In device code (GRID_SIMT) this
// is threadIdx.z; in the host compilation pass it is always 0.
// Nsimd is accepted for interface uniformity and is not used.
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  int lane = 0;
#ifdef GRID_SIMT
  lane = threadIdx.z;
#endif
  return lane;
}
 | 
			
		||||
 | 
			
		||||
// CUDA: non-blocking two-dimensional kernel launch.
// The loop body (__VA_ARGS__) is wrapped in a by-value, mutable device lambda
// taking (iter1, iter2, lane) and handed to LambdaApply. Each block has
// acceleratorThreads() x 1 x nsimd threads; the grid is ceil(num1/nt) x num2.
// No synchronisation is performed: follow with accelerator_barrier().
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    using Iterator = uint64_t;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
      __VA_ARGS__;							\
    };									\
    int nt=acceleratorThreads();					\
    dim3 cu_threads(nt,1,nsimd);					\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
  }
 | 
			
		||||
 | 
			
		||||
// CUDA kernel trampoline: derive (x,y,z) from the thread/block indices and
// invoke Lambda(x,y,z) for every thread that falls inside the
// num1 x num2 x num3 iteration space; threads outside it do nothing.
template<typename lambda>  __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  const uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
  const uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
  const uint64_t z = threadIdx.z;
  // Guard the padding threads introduced by rounding the grid up.
  if ( (x >= num1) || (y >= num2) || (z >= num3) ) return;
  Lambda(x,y,z);
}
 | 
			
		||||
 | 
			
		||||
// CUDA: wait for all outstanding device work, then report (but do not abort
// on) any pending CUDA error together with the file and line of the barrier.
// The macro argument is unused.
#define accelerator_barrier(dummy)					\
  {									\
    cudaDeviceSynchronize();						\
    cudaError err = cudaGetLastError();					\
    if ( err != cudaSuccess ) {						\
      printf("Cuda error %s \n%s\nLine %d\n",				\
	     cudaGetErrorString( err ),__FILE__,__LINE__);		\
    }									\
  }
 | 
			
		||||
 | 
			
		||||
inline void *acceleratorAllocShared(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr=NULL;
 | 
			
		||||
  auto err = cudaMallocManaged((void **)&ptr,bytes);
 | 
			
		||||
  if( err != cudaSuccess ) {
 | 
			
		||||
    ptr = (void *) NULL;
 | 
			
		||||
    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
};
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr=NULL;
 | 
			
		||||
  auto err = cudaMalloc((void **)&ptr,bytes);
 | 
			
		||||
  if( err != cudaSuccess ) {
 | 
			
		||||
    ptr = (void *) NULL;
 | 
			
		||||
    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
};
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 | 
			
		||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
 | 
			
		||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// SyCL acceleration
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
#include <CL/sycl.hpp>
 | 
			
		||||
#include <CL/sycl/usm.hpp>
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
extern cl::sycl::queue *theGridAccelerator;
 | 
			
		||||
 | 
			
		||||
#ifdef __SYCL_DEVICE_ONLY__
 | 
			
		||||
#define GRID_SIMT
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define accelerator 
 | 
			
		||||
#define accelerator_inline strong_inline
 | 
			
		||||
 | 
			
		||||
// SYCL: SIMD lane owned by the calling work-item. In device code (GRID_SIMT)
// this is component 2 of the local invocation id; on the host it is always 0.
// Nsimd is accepted for interface uniformity and is not used.
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  int lane = 0;
#ifdef GRID_SIMT
  lane = __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#endif
  return lane;
}
 | 
			
		||||
 | 
			
		||||
// SYCL: non-blocking two-dimensional kernel submission on theGridAccelerator.
// The body (__VA_ARGS__) runs once per work-item with iter1, iter2 and lane
// bound to the three global ids. Work-groups are acceleratorThreads() x 1 x nsimd
// and the global range is num1 x num2 x nsimd.
// NOTE(review): num1 is used unrounded here, whereas the CUDA/HIP variants
// round the block count up - confirm callers keep num1 a multiple of
// acceleratorThreads().
// NOTE(review): every expansion names its kernel `class dslash` - confirm the
// toolchain accepts repeated kernel names.
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
      unsigned long nt=acceleratorThreads();				\
      unsigned long unum1 = num1;					\
      unsigned long unum2 = num2;					\
      cl::sycl::range<3> local {nt,1,nsimd};				\
      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
      cgh.parallel_for<class dslash>(					\
	cl::sycl::nd_range<3>(global,local),				\
	[=] (cl::sycl::nd_item<3> item) mutable {			\
	  auto iter1    = item.get_global_id(0);			\
	  auto iter2    = item.get_global_id(1);			\
	  auto lane     = item.get_global_id(2);			\
	  { __VA_ARGS__ };						\
	});								\
    });
 | 
			
		||||
 | 
			
		||||
#define accelerator_barrier(dummy) theGridAccelerator->wait();
 | 
			
		||||
 | 
			
		||||
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 | 
			
		||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 | 
			
		||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// HIP acceleration
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
#include <hip/hip_runtime.h>
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
#ifdef __HIP_DEVICE_COMPILE__
 | 
			
		||||
#define GRID_SIMT
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define accelerator        __host__ __device__
 | 
			
		||||
#define accelerator_inline __host__ __device__ inline
 | 
			
		||||
 | 
			
		||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
 | 
			
		||||
// HIP: SIMD lane owned by the calling thread. In device code (GRID_SIMT) this
// is hipThreadIdx_z; in the host compilation pass it is always 0.
// Nsimd is accepted for interface uniformity and is not used.
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  int lane = 0;
#ifdef GRID_SIMT
  lane = hipThreadIdx_z;
#endif
  return lane;
}
 | 
			
		||||
 | 
			
		||||
// HIP: non-blocking two-dimensional kernel launch.
// The loop body (__VA_ARGS__) is wrapped in a by-value, mutable device lambda
// taking (iter1, iter2, lane) and launched through LambdaApply on the default
// stream with no dynamic shared memory. Each block has
// acceleratorThreads() x 1 x nsimd threads; the grid is ceil(num1/nt) x num2.
// No synchronisation is performed: follow with accelerator_barrier().
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    using Iterator = uint64_t;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane ) mutable {		\
      { __VA_ARGS__;}							\
    };									\
    int nt=acceleratorThreads();					\
    dim3 hip_threads(nt,1,nsimd);					\
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
    hipLaunchKernelGGL(LambdaApply,					\
		       hip_blocks,hip_threads,				\
		       0,0,						\
		       num1,num2,nsimd,lambda);				\
  }
 | 
			
		||||
 | 
			
		||||
// HIP kernel trampoline: derive (x,y,z) from the thread/block indices and
// invoke Lambda(x,y,z) for every thread that falls inside the
// numx x numy x numz iteration space; threads outside it do nothing.
template<typename lambda>  __global__
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  const uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
  const uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
  const uint64_t z = hipThreadIdx_z; // z is the lane: thread index only, no block offset
  // Guard the padding threads introduced by rounding the grid up.
  if ( (x >= numx) || (y >= numy) || (z >= numz) ) return;
  Lambda(x,y,z);
}
 | 
			
		||||
 | 
			
		||||
// HIP: wait for all outstanding device work; if a HIP error is pending, print
// it with the file and line of the barrier and terminate the process.
// The exit status is now a failure code: the previous exit(0) reported
// success to the launcher after a device error. The macro argument is unused.
#define accelerator_barrier(dummy)				\
  {								\
    hipDeviceSynchronize();					\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
      puts(__FILE__);						\
      printf("Line %d\n",__LINE__);				\
      exit(EXIT_FAILURE);					\
    }								\
  }
 | 
			
		||||
 | 
			
		||||
// HIP: "shared" allocations are currently plain host malloc; the managed-memory
// path is compiled out with #if 0. Returns whatever malloc returns.
inline void *acceleratorAllocShared(size_t bytes)
{
#if 0
  void *ptr=nullptr;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = nullptr;
    // bytes is a size_t: %zu is the matching conversion (was %d).
    printf(" hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
#else
  return malloc(bytes);
#endif
}
 | 
			
		||||
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  void *ptr=NULL;
 | 
			
		||||
  auto err = hipMalloc((void **)&ptr,bytes);
 | 
			
		||||
  if( err != hipSuccess ) {
 | 
			
		||||
    ptr = (void *) NULL;
 | 
			
		||||
    printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){ free(ptr);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 | 
			
		||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 | 
			
		||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// Common on all GPU targets
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
 | 
			
		||||
// One-dimensional non-blocking loop: a 2d launch whose second extent is 1.
#define accelerator_forNB( iter1, num1, nsimd, ... )			\
  accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );

// Blocking variants: launch, then synchronise the device.
// These expand to two statements, so use them only where a statement
// sequence is valid (not as the sole body of an unbraced if/for).
#define accelerator_for( iter, num, nsimd, ... )			\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );		\
  accelerator_barrier(dummy);

#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )	\
  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
  accelerator_barrier(dummy);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
// CPU Target - No accelerator just thread instead
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned 
 | 
			
		||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
 | 
			
		||||
 | 
			
		||||
#undef GRID_SIMT
 | 
			
		||||
 | 
			
		||||
// CPU target: the accelerator attributes collapse to ordinary inlining and the
// accelerator loops map onto the host threading macros. nsimd is ignored and
// the barrier expands to nothing.
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... )			\
  thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... )			\
  thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )	\
  thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 | 
			
		||||
 | 
			
		||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CPU target: no SIMT threads, the lane is always 0
 | 
			
		||||
// CPU target: both "device" copies are plain host memcpy calls.
// Note the argument order: source first, destination second.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
  memcpy(to,from,bytes);
}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
  memcpy(to,from,bytes);
}
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 | 
			
		||||
#else
 | 
			
		||||
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){free(ptr);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif // CPU target
 | 
			
		||||
 | 
			
		||||
#ifdef HAVE_MM_MALLOC_H
 | 
			
		||||
inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 | 
			
		||||
inline void acceleratorFreeCpu  (void *ptr){_mm_free(ptr);};
 | 
			
		||||
#else
 | 
			
		||||
inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
 | 
			
		||||
inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////
 | 
			
		||||
// Synchronise across local threads for divergence resynch
 | 
			
		||||
///////////////////////////////////////////////////
 | 
			
		||||
// Resynchronise local threads after divergent execution.
// With GRID_SIMT defined this emits __syncwarp() under GRID_CUDA and
// __syncthreads() under GRID_HIP; nothing is emitted for GRID_SYCL.
// Without GRID_SIMT the body is empty.
accelerator_inline void acceleratorSynchronise(void)
{
#if defined(GRID_SIMT)
#  if defined(GRID_CUDA)
  __syncwarp();
#  endif
#  if defined(GRID_SYCL)
  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
#  endif
#  if defined(GRID_HIP)
  __syncthreads();
#  endif
#endif
}
 | 
			
		||||
// Wider synchronisation than acceleratorSynchronise.
// With GRID_SIMT defined this emits __syncthreads() under both GRID_CUDA and
// GRID_HIP; nothing is emitted for GRID_SYCL. Without GRID_SIMT the body is empty.
accelerator_inline void acceleratorSynchroniseAll(void)
{
#if defined(GRID_SIMT)
#  if defined(GRID_CUDA)
  __syncthreads();
#  endif
#  if defined(GRID_SYCL)
  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
#  endif
#  if defined(GRID_HIP)
  __syncthreads();
#  endif
#endif
}
 | 
			
		||||
// Memory fence for accelerator code.
// With GRID_SIMT defined this emits __threadfence() under GRID_CUDA and
// GRID_HIP; the SYCL case is not implemented. Without GRID_SIMT the body is empty.
accelerator_inline void acceleratorFence(void)
{
#if defined(GRID_SIMT)
#  if defined(GRID_CUDA)
  __threadfence();
#  endif
#  if defined(GRID_SYCL)
  // FIXME: no fence is issued on SYCL
#  endif
#  if defined(GRID_HIP)
  __threadfence();
#  endif
#endif
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/Threads.h
 | 
			
		||||
    Source file: ./lib/Pragmas.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
@@ -28,107 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once
 | 
			
		||||
 | 
			
		||||
#ifndef MAX
 | 
			
		||||
#define MAX(x,y) ((x)>(y)?(x):(y))
 | 
			
		||||
#define MIN(x,y) ((x)>(y)?(y):(x))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define strong_inline     __attribute__((always_inline)) inline
 | 
			
		||||
#define UNROLL  _Pragma("unroll")
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// New primitives; explicit host thread calls, and accelerator data parallel calls
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef _OPENMP
 | 
			
		||||
#define GRID_OMP
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#define DO_PRAGMA_(x) _Pragma (#x)
 | 
			
		||||
#define DO_PRAGMA(x) DO_PRAGMA_(x)
 | 
			
		||||
#define thread_num(a) omp_get_thread_num()
 | 
			
		||||
#define thread_max(a) omp_get_max_threads()
 | 
			
		||||
#else 
 | 
			
		||||
#define DO_PRAGMA_(x) 
 | 
			
		||||
#define DO_PRAGMA(x) 
 | 
			
		||||
#define thread_num(a) (0)
 | 
			
		||||
#define thread_max(a) (1)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_region                                       DO_PRAGMA(omp parallel)
 | 
			
		||||
#define thread_critical                                     DO_PRAGMA(omp critical)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Accelerator primitives; fall back to threading
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef __NVCC__
 | 
			
		||||
#define GRID_NVCC
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
 | 
			
		||||
extern uint32_t gpu_threads;
 | 
			
		||||
 | 
			
		||||
#define accelerator        __host__ __device__
 | 
			
		||||
#define accelerator_inline __host__ __device__ inline
 | 
			
		||||
 | 
			
		||||
template<typename lambda>  __global__
 | 
			
		||||
void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t isite = threadIdx.y;
 | 
			
		||||
  uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
 | 
			
		||||
  if ( (osite <Osites) && (isite<Isites) ) {
 | 
			
		||||
    Lambda(isite,osite);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
// Internal only really... but need to call when 
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
#define accelerator_barrier(dummy)				\
 | 
			
		||||
  {								\
 | 
			
		||||
    cudaDeviceSynchronize();					\
 | 
			
		||||
    cudaError err = cudaGetLastError();				\
 | 
			
		||||
    if ( cudaSuccess != err ) {					\
 | 
			
		||||
      printf("Cuda error %s \n", cudaGetErrorString( err )); \
 | 
			
		||||
      puts(__FILE__); \
 | 
			
		||||
      printf("Line %d\n",__LINE__);					\
 | 
			
		||||
      exit(0);							\
 | 
			
		||||
    }								\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
// Copy the for_each_n style ; Non-blocking variant
 | 
			
		||||
#define accelerator_forNB( iterator, num, nsimd, ... )			\
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef uint64_t Iterator;						\
 | 
			
		||||
    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
 | 
			
		||||
      __VA_ARGS__;							\
 | 
			
		||||
    };									\
 | 
			
		||||
    dim3 cu_threads(gpu_threads,nsimd);					\
 | 
			
		||||
    dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads);			\
 | 
			
		||||
    LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda);	\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
// Copy the for_each_n style ; Non-blocking variant (default
 | 
			
		||||
#define accelerator_for( iterator, num, nsimd, ... )		\
 | 
			
		||||
  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } );	\
 | 
			
		||||
  accelerator_barrier(dummy);
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
#define accelerator 
 | 
			
		||||
#define accelerator_inline strong_inline
 | 
			
		||||
#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
 | 
			
		||||
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
 | 
			
		||||
#define accelerator_barrier(dummy) 
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
#include <Grid/threads/Threads.h>
 | 
			
		||||
#include <Grid/threads/Accelerator.h>
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										127
									
								
								Grid/threads/ThreadReduction.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										127
									
								
								Grid/threads/ThreadReduction.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,127 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/ThreadReduction.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once 
 | 
			
		||||
 | 
			
		||||
// Introduce a class to gain deterministic bit reproducible reduction.
 | 
			
		||||
// make static; perhaps just a namespace is required.
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
class GridThread {
 | 
			
		||||
public:
 | 
			
		||||
  static int _threads;
 | 
			
		||||
  static int _hyperthreads;
 | 
			
		||||
  static int _cores;
 | 
			
		||||
 | 
			
		||||
  static void SetCores(int cr) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _cores = cr;
 | 
			
		||||
#else 
 | 
			
		||||
    _cores = 1;
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
  static void SetThreads(int thr) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _threads = MIN(thr,omp_get_max_threads()) ;
 | 
			
		||||
    omp_set_num_threads(_threads);
 | 
			
		||||
#else 
 | 
			
		||||
    _threads = 1;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  static void SetMaxThreads(void) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _threads = omp_get_max_threads();
 | 
			
		||||
    omp_set_num_threads(_threads);
 | 
			
		||||
#else 
 | 
			
		||||
    _threads = 1;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
 | 
			
		||||
  static int GetCores(void)   { return _cores; };
 | 
			
		||||
  static int GetThreads(void) { return _threads; };
 | 
			
		||||
  static int SumArraySize(void) {return _threads;};
 | 
			
		||||
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff){
 | 
			
		||||
    GetWork(nwork,me,mywork,myoff,_threads);
 | 
			
		||||
  }
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
 | 
			
		||||
    int basework = nwork/units;
 | 
			
		||||
    int backfill = units-(nwork%units);
 | 
			
		||||
    if ( me >= units ) { 
 | 
			
		||||
      mywork = myoff = 0;
 | 
			
		||||
    } else { 
 | 
			
		||||
      mywork = (nwork+me)/units;
 | 
			
		||||
      myoff  = basework * me;
 | 
			
		||||
      if ( me > backfill ) 
 | 
			
		||||
	myoff+= (me-backfill);
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
 | 
			
		||||
    me     = ThreadBarrier();
 | 
			
		||||
    GetWork(nwork,me,mywork,myoff);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static int  ThreadBarrier(void) {
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
    return omp_get_thread_num();
 | 
			
		||||
#else
 | 
			
		||||
    return 0;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
 | 
			
		||||
    sum_array[me] = val;
 | 
			
		||||
    val=Zero();
 | 
			
		||||
    ThreadBarrier();
 | 
			
		||||
    for(int i=0;i<_threads;i++) val+= sum_array[i];
 | 
			
		||||
    ThreadBarrier();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void bcopy(const void *src, void *dst, size_t len) {
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
    {
 | 
			
		||||
      const char *c_src =(char *) src;
 | 
			
		||||
      char *c_dest=(char *) dst;
 | 
			
		||||
      int me,mywork,myoff;
 | 
			
		||||
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
 | 
			
		||||
      bcopy(&c_src[myoff],&c_dest[myoff],mywork);
 | 
			
		||||
    }
 | 
			
		||||
#else 
 | 
			
		||||
    bcopy(src,dst,len);
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
@@ -28,101 +28,47 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#pragma once 
 | 
			
		||||
 | 
			
		||||
#ifndef MAX
 | 
			
		||||
#define MAX(x,y) ((x)>(y)?(x):(y))
 | 
			
		||||
#define MIN(x,y) ((x)>(y)?(y):(x))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// Introduce a class to gain deterministic bit reproducible reduction.
 | 
			
		||||
// make static; perhaps just a namespace is required.
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
#define strong_inline     __attribute__((always_inline)) inline
 | 
			
		||||
#define UNROLL  _Pragma("unroll")
 | 
			
		||||
 | 
			
		||||
class GridThread {
 | 
			
		||||
public:
 | 
			
		||||
  static int _threads;
 | 
			
		||||
  static int _hyperthreads;
 | 
			
		||||
  static int _cores;
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// New primitives; explicit host thread calls, and accelerator data parallel calls
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef _OPENMP
 | 
			
		||||
#define GRID_OMP
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  static void SetCores(int cr) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _cores = cr;
 | 
			
		||||
#define DO_PRAGMA_(x) _Pragma (#x)
 | 
			
		||||
#define DO_PRAGMA(x) DO_PRAGMA_(x)
 | 
			
		||||
#define thread_num(a) omp_get_thread_num()
 | 
			
		||||
#define thread_max(a) omp_get_max_threads()
 | 
			
		||||
#else 
 | 
			
		||||
    _cores = 1;
 | 
			
		||||
#define DO_PRAGMA_(x) 
 | 
			
		||||
#define DO_PRAGMA(x) 
 | 
			
		||||
#define thread_num(a) (0)
 | 
			
		||||
#define thread_max(a) (1)
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
  static void SetThreads(int thr) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _threads = MIN(thr,omp_get_max_threads()) ;
 | 
			
		||||
    omp_set_num_threads(_threads);
 | 
			
		||||
#else 
 | 
			
		||||
    _threads = 1;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  static void SetMaxThreads(void) { 
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
    _threads = omp_get_max_threads();
 | 
			
		||||
    omp_set_num_threads(_threads);
 | 
			
		||||
#else 
 | 
			
		||||
    _threads = 1;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
 | 
			
		||||
  static int GetCores(void)   { return _cores; };
 | 
			
		||||
  static int GetThreads(void) { return _threads; };
 | 
			
		||||
  static int SumArraySize(void) {return _threads;};
 | 
			
		||||
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff){
 | 
			
		||||
    GetWork(nwork,me,mywork,myoff,_threads);
 | 
			
		||||
  }
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
 | 
			
		||||
    int basework = nwork/units;
 | 
			
		||||
    int backfill = units-(nwork%units);
 | 
			
		||||
    if ( me >= units ) { 
 | 
			
		||||
      mywork = myoff = 0;
 | 
			
		||||
    } else { 
 | 
			
		||||
      mywork = (nwork+me)/units;
 | 
			
		||||
      myoff  = basework * me;
 | 
			
		||||
      if ( me > backfill ) 
 | 
			
		||||
	myoff+= (me-backfill);
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
 | 
			
		||||
    me     = ThreadBarrier();
 | 
			
		||||
    GetWork(nwork,me,mywork,myoff);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static int  ThreadBarrier(void) {
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
    return omp_get_thread_num();
 | 
			
		||||
#else
 | 
			
		||||
    return 0;
 | 
			
		||||
#endif
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
 | 
			
		||||
    sum_array[me] = val;
 | 
			
		||||
    val=Zero();
 | 
			
		||||
    ThreadBarrier();
 | 
			
		||||
    for(int i=0;i<_threads;i++) val+= sum_array[i];
 | 
			
		||||
    ThreadBarrier();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void bcopy(const void *src, void *dst, size_t len) {
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
    {
 | 
			
		||||
      const char *c_src =(char *) src;
 | 
			
		||||
      char *c_dest=(char *) dst;
 | 
			
		||||
      int me,mywork,myoff;
 | 
			
		||||
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
 | 
			
		||||
      bcopy(&c_src[myoff],&c_dest[myoff],mywork);
 | 
			
		||||
    }
 | 
			
		||||
#else 
 | 
			
		||||
    bcopy(src,dst,len);
 | 
			
		||||
#endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
#define thread_for( i, num, ... )                           DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for2d( i1, n1,i2,n2, ... )  \
 | 
			
		||||
  DO_PRAGMA(omp parallel for collapse(2))  \
 | 
			
		||||
  for ( uint64_t i1=0;i1<n1;i1++) {	   \
 | 
			
		||||
  for ( uint64_t i2=0;i2<n2;i2++) {	   \
 | 
			
		||||
  { __VA_ARGS__ } ;			   \
 | 
			
		||||
  }}
 | 
			
		||||
#define thread_foreach( i, container, ... )                 DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_in_region( i, num, ... )                 DO_PRAGMA(omp for schedule(static))          for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse2( i, num, ... )                 DO_PRAGMA(omp parallel for collapse(2))      for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse( N , i, num, ... )              DO_PRAGMA(omp parallel for collapse ( N ) )  for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_for_collapse_in_region( N , i, num, ... )    DO_PRAGMA(omp for collapse ( N ))            for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 | 
			
		||||
#define thread_region                                       DO_PRAGMA(omp parallel)
 | 
			
		||||
#define thread_critical                                     DO_PRAGMA(omp critical)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -52,14 +52,14 @@ public:
 | 
			
		||||
  accelerator_inline size_type size(void) const { return _size; };
 | 
			
		||||
  accelerator_inline void  clear(void) { resize(0);}
 | 
			
		||||
  accelerator_inline void  resize(size_type sz) {
 | 
			
		||||
#ifndef GRID_HIP
 | 
			
		||||
    assert(sz>=0);
 | 
			
		||||
    assert(sz<=MaxEntries);
 | 
			
		||||
#endif
 | 
			
		||||
    _size = sz;
 | 
			
		||||
  }
 | 
			
		||||
  accelerator_inline void  resize(size_type sz,const value &val) {
 | 
			
		||||
    assert(sz>=0);
 | 
			
		||||
    assert(sz<=MaxEntries);
 | 
			
		||||
    _size = sz;
 | 
			
		||||
    resize(sz);
 | 
			
		||||
    for(int s=0;s<sz;s++) _data[s]=val;
 | 
			
		||||
  }
 | 
			
		||||
  accelerator_inline pointer begin(void)                   { return &_data[0]; } 
 | 
			
		||||
@@ -67,7 +67,7 @@ public:
 | 
			
		||||
  accelerator_inline pointer end  (void)                   { return &_data[_size]; } 
 | 
			
		||||
  accelerator_inline const_pointer end  (void) const       { return &_data[_size]; } 
 | 
			
		||||
  accelerator_inline void push_back(const value &val)      { resize(_size+1); _data[_size-1] = val;}
 | 
			
		||||
  accelerator_inline AcceleratorVector()                   { _size = 0; }
 | 
			
		||||
  accelerator_inline AcceleratorVector()                   { resize(0); }
 | 
			
		||||
  accelerator_inline AcceleratorVector(size_type sz)           { resize(sz); }
 | 
			
		||||
  accelerator_inline AcceleratorVector(size_type sz,const value &val) { resize(sz,val); }
 | 
			
		||||
  AcceleratorVector(const std::vector<value> ©me) { 
 | 
			
		||||
 
 | 
			
		||||
@@ -73,8 +73,6 @@ feenableexcept (unsigned int excepts)
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
uint32_t gpu_threads=8;
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
@@ -192,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
 | 
			
		||||
    assert(ompthreads.size()==1);
 | 
			
		||||
    GridThread::SetThreads(ompthreads[0]);
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
 | 
			
		||||
    std::vector<int> gputhreads(0);
 | 
			
		||||
#ifndef GRID_NVCC
 | 
			
		||||
    std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
 | 
			
		||||
              << " not compiled with GPU support" << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
 | 
			
		||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
 | 
			
		||||
    GridCmdOptionIntVector(arg,gputhreads);
 | 
			
		||||
    assert(gputhreads.size()==1);
 | 
			
		||||
    gpu_threads=gputhreads[0];
 | 
			
		||||
    acceleratorThreads(gputhreads[0]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
 | 
			
		||||
@@ -241,8 +235,6 @@ static int Grid_is_initialised;
 | 
			
		||||
/////////////////////////////////////////////////////////
 | 
			
		||||
void GridBanner(void)
 | 
			
		||||
{
 | 
			
		||||
  static int printed =0;
 | 
			
		||||
  if( !printed ) {
 | 
			
		||||
    std::cout <<std::endl;
 | 
			
		||||
    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
 | 
			
		||||
    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
 | 
			
		||||
@@ -278,67 +270,6 @@ void GridBanner(void)
 | 
			
		||||
    std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
    std::cout << std::endl;
 | 
			
		||||
    printed=1;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
cudaDeviceProp *gpu_props;
 | 
			
		||||
#endif
 | 
			
		||||
void GridGpuInit(void)
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
  int nDevices = 1;
 | 
			
		||||
  cudaGetDeviceCount(&nDevices);
 | 
			
		||||
  gpu_props = new cudaDeviceProp[nDevices];
 | 
			
		||||
 | 
			
		||||
  char * localRankStr = NULL;
 | 
			
		||||
  int rank = 0, world_rank=0; 
 | 
			
		||||
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
 | 
			
		||||
#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
 | 
			
		||||
#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
 | 
			
		||||
  // We extract the local rank initialization using an environment variable
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
 | 
			
		||||
  {
 | 
			
		||||
    rank = atoi(localRankStr);		
 | 
			
		||||
  }
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
 | 
			
		||||
 | 
			
		||||
  if ( world_rank == 0 ) {
 | 
			
		||||
    GridBanner();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < nDevices; i++) {
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("GpuInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 | 
			
		||||
#define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
 | 
			
		||||
    
 | 
			
		||||
    cudaGetDeviceProperties(&gpu_props[i], i);
 | 
			
		||||
    if ( world_rank == 0) {
 | 
			
		||||
      cudaDeviceProp prop; 
 | 
			
		||||
      prop = gpu_props[i];
 | 
			
		||||
      printf("GpuInit: ========================\n");
 | 
			
		||||
      printf("GpuInit: Device Number    : %d\n", i);
 | 
			
		||||
      printf("GpuInit: ========================\n");
 | 
			
		||||
      printf("GpuInit: Device identifier: %s\n", prop.name);
 | 
			
		||||
 | 
			
		||||
      GPU_PROP(managedMemory);
 | 
			
		||||
      GPU_PROP(isMultiGpuBoard);
 | 
			
		||||
      GPU_PROP(warpSize);
 | 
			
		||||
      //      GPU_PROP(unifiedAddressing);
 | 
			
		||||
      //      GPU_PROP(l2CacheSize);
 | 
			
		||||
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if ( world_rank == 0 ) {
 | 
			
		||||
    printf("GpuInit: ================================================\n");
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Grid_init(int *argc,char ***argv)
 | 
			
		||||
@@ -353,9 +284,7 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
  // Early initialisation necessities without rank knowledge
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
  GridGpuInit(); // Must come first to set device prior to MPI init
 | 
			
		||||
 | 
			
		||||
  PointerCache::Init();
 | 
			
		||||
  acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
 | 
			
		||||
    int MB;
 | 
			
		||||
@@ -365,6 +294,14 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
 | 
			
		||||
    int MB;
 | 
			
		||||
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
 | 
			
		||||
    GridCmdOptionInt(arg,MB);
 | 
			
		||||
    uint64_t MB64 = MB;
 | 
			
		||||
    MemoryManager::DeviceMaxBytes = MB64*1024LL*1024LL;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){
 | 
			
		||||
    int enable;
 | 
			
		||||
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube");
 | 
			
		||||
@@ -381,6 +318,11 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    Grid_debug_handler_init();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
  // Memory manager
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
  MemoryManager::Init();
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
  // MPI initialisation
 | 
			
		||||
  //////////////////////////////////////////////////////////
 | 
			
		||||
@@ -419,11 +361,18 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "================================================ "<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  // Reporting
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  std::cout << GridLogMessage << "Requested "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
 | 
			
		||||
  if ( GlobalSharedMemory::Hugepages) {
 | 
			
		||||
    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
  std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
 | 
			
		||||
    MemoryProfiler::debug = true;
 | 
			
		||||
 
 | 
			
		||||
@@ -237,9 +237,9 @@ public:
 | 
			
		||||
 | 
			
		||||
      Vec rn ; random(sRNG,rn);
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); z=rn;
 | 
			
		||||
      LatticeVec x(&Grid); x=rn;
 | 
			
		||||
      LatticeVec y(&Grid); y=rn;
 | 
			
		||||
      LatticeVec z(&Grid); z=Zero();
 | 
			
		||||
      LatticeVec x(&Grid); x=Zero();
 | 
			
		||||
      LatticeVec y(&Grid); y=Zero();
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
@@ -247,9 +247,9 @@ public:
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
	z=a*x-y;
 | 
			
		||||
	auto x_v = x.View();
 | 
			
		||||
	auto y_v = y.View();
 | 
			
		||||
	auto z_v = z.View();
 | 
			
		||||
	autoView( x_v , x, CpuWrite);
 | 
			
		||||
	autoView( y_v , y, CpuWrite);
 | 
			
		||||
	autoView( z_v , z, CpuRead);
 | 
			
		||||
        x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
 | 
			
		||||
        y_v[4]=z_v[4];
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
@@ -21,7 +21,7 @@
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#ifdef GRID_NVCC
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#define CUDA_PROFILE
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
@@ -129,8 +129,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  LatticeGaugeField Umu5d(FGrid); 
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,FGrid);
 | 
			
		||||
  {
 | 
			
		||||
    auto Umu5d_v = Umu5d.View();
 | 
			
		||||
    auto Umu_v = Umu.View();
 | 
			
		||||
    autoView( Umu5d_v, Umu5d, CpuWrite);
 | 
			
		||||
    autoView( Umu_v  , Umu  , CpuRead);
 | 
			
		||||
    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
 | 
			
		||||
      for(int s=0;s<Ls;s++){
 | 
			
		||||
	Umu5d_v[Ls*ss+s] = Umu_v[ss];
 | 
			
		||||
@@ -258,8 +258,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
 | 
			
		||||
      tmp = U[mu]*Cshift(src,mu+1,1);
 | 
			
		||||
      {
 | 
			
		||||
	auto ref_v = ref.View();
 | 
			
		||||
	auto tmp_v = tmp.View();
 | 
			
		||||
	autoView( ref_v, ref, CpuWrite);
 | 
			
		||||
	autoView( tmp_v, tmp, CpuRead);
 | 
			
		||||
	for(int i=0;i<ref_v.size();i++){
 | 
			
		||||
	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
 | 
			
		||||
	}
 | 
			
		||||
@@ -268,8 +268,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
      tmp =adj(U[mu])*src;
 | 
			
		||||
      tmp =Cshift(tmp,mu+1,-1);
 | 
			
		||||
      {
 | 
			
		||||
	auto ref_v = ref.View();
 | 
			
		||||
	auto tmp_v = tmp.View();
 | 
			
		||||
	autoView( ref_v, ref, CpuWrite);
 | 
			
		||||
	autoView( tmp_v, tmp, CpuRead);
 | 
			
		||||
	for(int i=0;i<ref_v.size();i++){
 | 
			
		||||
	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user