Checking in fixed adaptive WilsonFlow

Update README.md
Remove Travis CI, which constantly fails by exceeding the build time limit (Grid can no longer be compiled within the free-tier time allowance)
2026-03-10 22:46:13 +00:00 · 2021-06-07 14:20:27 -04:00 · 2021-06-06 04:52:05 -04:00 · 2021-06-04 11:12:22 +01:00 · 2021-06-03 04:24:19 +00:00 · 2021-05-05 14:17:18 -07:00
25 changed files with 334 additions and 284 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,56 +0,0 @@
-language: cpp
-
-cache:
-  directories:
-    - clang
-
-matrix:
-  include:
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      
-before_install:
-    - export GRIDDIR=`pwd`
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
-    
-install:
-    - export CWD=`pwd`
-    - echo $CWD
-    - export CC=$CC$VERSION
-    - export CXX=$CXX$VERSION
-    - echo $PATH
-    - which autoconf
-    - autoconf  --version
-    - which automake
-    - automake  --version
-    - which $CC
-    - $CC  --version
-    - which $CXX
-    - $CXX --version
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
-    
-script:
-    - ./bootstrap.sh
-    - mkdir build
-    - cd build
-    - mkdir lime
-    - cd lime
-    - mkdir build
-    - cd build
-    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
-    - tar xf lime-1.3.2.tar.gz
-    - cd lime-1.3.2
-    - ./configure --prefix=$CWD/build/lime/install
-    - make -j4
-    - make install
-    - cd $CWD/build
-    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
-    - make -j4 
-    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - make check
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -122,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf(buffer_size);
-  cshiftVector<vobj> recv_buf(buffer_size);
+  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -198,8 +198,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
 
@@ -294,8 +294,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf_v(buffer_size);
-  cshiftVector<vobj> recv_buf_v(buffer_size);
+  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
@@ -381,8 +381,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  //      oss << std::put_time(&tm_, "%c %Z");
+  oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;

--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -205,11 +205,20 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

+  // Preferred interface: forwards to the full overload (two_row=0, bits32=1), explicitly passing the caller's GaugeStats
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
+					std::string file, 
+					std::string ens_label = std::string("DWF"))
+  {
+    writeConfiguration<GaugeStats>(Umu,file,0,1,ens_label);
+  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32)
+					int bits32,
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
@@ -219,8 +228,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = "UKQCD";
-    header.ensemble_label  = "DWF";
+    header.ensemble_id     = std::string("UKQCD");
+    header.ensemble_label  = ens_label;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -232,7 +241,7 @@ public:
    GaugeStats Stats; Stats(Umu,header);
    MachineCharacteristics(header);

-	uint64_t offset;
+    uint64_t offset;

    // Sod it -- always write 3x3 double
    header.floating_point = std::string("IEEE64BIG");
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

-#ifndef GRID_CUDA
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
-#endif
-
 NAMESPACE_END(Grid);

 ////////////////////
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
-NAMESPACE_CHECK(ImplStaggered5dVec);  
+// Deprecate Vec5d
+//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+//NAMESPACE_CHECK(ImplStaggered5dVec);  


--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -72,19 +72,23 @@ public:
    
  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
      
-  static accelerator_inline void multLink(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
 		       const SiteDoubledGaugeField &U,
-		       const SiteSpinor &chi,
+		       const _Spinor &chi,
 		       int mu)
  {
-    mult(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
  }
-  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLinkAdd(_Spinor &phi,
 			  const SiteDoubledGaugeField &U,
-			  const SiteSpinor &chi,
+			  const _Spinor &chi,
 			  int mu)
  {
-    mac(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mac(&phi(), &UU, &chi());
  }
      
  template <class ref>
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -184,18 +184,22 @@ public:
      mat = TraceIndex<SpinIndex>(P); 
    }
      
-    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    {
      for (int mu = 0; mu < Nd; mu++)
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
    }

-
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+  {
+#undef USE_OLD_INSERT_FORCE    
    int Ls=Btilde.Grid()->_fdimensions[0];
+    autoView( mat_v , mat, AcceleratorWrite);
+#ifdef USE_OLD_INSERT_FORCE    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
+      const int Nsimd = SiteSpinor::Nsimd();
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -208,6 +212,29 @@ public:
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
+#else
+    {
+      const int Nsimd = SiteSpinor::Nsimd();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+	  zeroit(sum);  
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
+	      auto op = outerProduct(bb,aa);
+  	      sum = sum + op;
+	    }
+	  }
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+      });
+    }
+#endif    
  }
 };

--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -880,11 +880,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
+  Integer sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
+  else if ( curr_type == Current::Tadpole ) {
+    auto b=this->_b;
+    auto c=this->_c;
+    if ( b == 1 && c == 0 ) {
+      sign = -1;    
+    }
+    else {
+      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
+      assert(b==1 && c==0);
+    }
+  }

  for(int s=0;s<Ls;s++){

@@ -907,7 +919,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
-
+#undef STAG_VEC5D
+#ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
 #endif
 }
   
-   
+#endif   


 #define PERMUTE_DIR3 __asm__ (	\
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-#define LOAD_CHI(b)		\
+#ifdef GRID_SIMT
+
+#define LOAD_CHI(ptype,b)			\
+  const SiteSpinor & ref (b[offset]);				\
+  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
+  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
+  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
+
+#define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
-    Chi_2=ref()()(2);
+  Chi_0=coalescedRead(ref()()(0),lane);	\
+  Chi_1=coalescedRead(ref()()(1),lane);	\
+  Chi_2=coalescedRead(ref()()(2),lane);
+
+#define PERMUTE_DIR(dir)	;
+#else
+#define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b)
+
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=ref()()(0);			\
+  Chi_1=ref()()(1);			\
+  Chi_2=ref()()(2);
+
+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+#endif


 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);

 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;


-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in);					\
+    LOAD_CHI(Perm,in);						\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI(buf);						\
+    LOAD_CHI_COMMS(buf);					\
  }								

 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
  }


-
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in);				\
+    LOAD_CHI(Perm,in);					\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
+    LOAD_CHI_COMMS(buf);				\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
+    { LOAD_CHI_COMMS(buf);	  }				\
    { MULT_ADD(U,Dir,even); }					\
  }								

+#define HAND_DECLARATIONS(Simd) \
+  Simd even_0;			\
+  Simd even_1;			\
+  Simd even_2;			\
+  Simd odd_0;			\
+  Simd odd_1;			\
+  Simd odd_2;		        \
+		      		\
+  Simd Chi_0;			\
+  Simd Chi_1;			\
+  Simd Chi_2;			\
+				\
+  Simd U_00;			\
+  Simd U_10;			\
+  Simd U_20;			\
+  Simd U_01;			\
+  Simd U_11;			\
+  Simd U_21;			\
+  Simd U_02;			\
+  Simd U_12;			\
+  Simd U_22;			
+  

 template <class Impl>
 template <int Naik> accelerator_inline
@@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset,local,perm, ptype;

  StencilEntry *SE;
@@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local, perm;

  StencilEntry *SE;
@@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local;

  StencilEntry *SE;
@@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out[sF] = out[sF] + result;
+      coalescedWrite(out[sF] , out(sF)+ result);
    }
  }
 }
@@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
+#undef HAND_DECLARATIONS

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
-    } else {							\
-      chi_p = &in[SE->_offset];					\
-    }								\
+    int perm= SE->_permute;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else {							\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
+  acceleratorSynchronise();					\
+  multLink(Uchi, U[sU], chi, Dir);			

 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
-    } else {							\
-      chi_p = &in[SE->_offset];					\
-    }								\
+    int perm= SE->_permute;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);                 \
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 template <class Impl>
@@ -84,12 +77,14 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
 					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //
@@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -130,13 +125,16 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+						const FermionFieldView &in, FermionFieldView &out,int dag)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -165,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -178,14 +176,17 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  //  SiteSpinor chi;
-  SiteSpinor Uchi;
+						const FermionFieldView &in, FermionFieldView &out,int dag)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu=0;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -211,11 +212,12 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    }
-    if ( nmu ) { 
-      if ( dag ) { 
-	out[sF] = out[sF] - Uchi;
+    if ( nmu ) {
+      auto _out = coalescedRead(out[sF],lane);
+      if ( dag ) {
+	coalescedWrite(out[sF], _out-Uchi,lane);
      } else { 
-	out[sF] = out[sF] + Uchi;
+	coalescedWrite(out[sF], _out+Uchi,lane);
      }
    }
  }
@@ -261,6 +263,8 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -301,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -85,21 +85,18 @@ public:

    std::cout << GridLogDebug << "Stout smearing started\n";

-    // Smear the configurations
+    // C contains the staples multiplied by some rho
+    u_smr = U ; // set the smeared field to the current gauge field
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      if( mu == OrthogDim )
-        tmp = 1.0;  // Don't smear in the orthogonal direction
-      else {
-        tmp = peekLorentz(C, mu);
-        Umu = peekLorentz(U, mu);
-        iq_mu = Ta(
-                   tmp *
-                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
-        exponentiate_iQ(tmp, iq_mu);
-      }
-      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+      if( mu == OrthogDim ) continue ;
+      // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
+      Umu = peekLorentz(U, mu);
+      tmp = peekLorentz(C, mu);
+      iq_mu = Ta( tmp * adj(Umu));  
+      exponentiate_iQ(tmp, iq_mu);
+      pokeLorentz(u_smr, tmp * Umu, mu);
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
  };
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -6,7 +6,8 @@ Source file: ./lib/qcd/modules/plaquette.h

 Copyright (C) 2017

-Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Guido Cossu  <guido.cossu@ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -35,7 +36,7 @@ template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
  unsigned int Nstep;
  unsigned int measure_interval;
-  mutable RealD epsilon, taus;
+  mutable RealD epsilon, taus,tolerance;


  mutable WilsonGaugeAction<Gimpl> SG;
@@ -47,13 +48,15 @@ class WilsonFlow: public Smear<Gimpl>{
 public:
  INHERIT_GIMPL_TYPES(Gimpl)

-  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
+  explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1, RealD tol = 1e-3):
  Nstep(Nstep),
    epsilon(epsilon),
+    tolerance(tol),
    measure_interval(interval),
    SG(WilsonGaugeAction<Gimpl>(3.0)) {
    // WilsonGaugeAction with beta 3.0
    assert(epsilon > 0.0);
+    assert(tolerance > 0.0);
    LogMessage();
  }

@@ -64,6 +67,8 @@ public:
 	      << "[WilsonFlow] epsilon : " << epsilon << std::endl;
    std::cout << GridLogMessage
 	      << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
+    std::cout << GridLogMessage
+	      << "[WilsonFlow] tolerance : " << tolerance << std::endl;
  }

  virtual void smear(GaugeField&, const GaugeField&) const;
@@ -106,11 +111,14 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  if (maxTau - taus < epsilon){
    epsilon = maxTau-taus;
  }
-  //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
+  std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
  GaugeField Z(U.Grid());
  GaugeField Zprime(U.Grid());
-  GaugeField tmp(U.Grid()), Uprime(U.Grid());
+  GaugeField tmp(U.Grid()), Uprime(U.Grid()),Usave(U.Grid());
+
  Uprime = U;
+  Usave = U;
+
  SG.deriv(U, Z);
  Zprime = -Z;
  Z *= 0.25;                                  // Z0 = 1/4 * F(U)
@@ -128,18 +136,33 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
  Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
  Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2

-  // Ramos 
+  // Ramos arXiv:1301.4388
  Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
  // Compute distance as norm^2 of the difference
  GaugeField diffU = U - Uprime;
-  RealD diff = norm2(diffU);
-  // adjust integration step
+// Wrong
+//  RealD diff = norm2(diffU);
+//  std::cout << GridLogMessage << "norm2: " << diff << std::endl;
+  
+//  RealD tol=1e-3;
+  
+  RealD diff = real(rankInnerMax(diffU,diffU));
+  diff = sqrt(diff)/18.; // distance defined in Ramos 
+
+  GridBase *grid = diffU.Grid();
+  std::cout << GridLogMessage << "max: " << diff << std::endl;
+  grid->GlobalMax(diff);
+  std::cout << GridLogMessage << "max: " << diff << std::endl;
    
+  if(diff < tolerance) {
  taus += epsilon;
-  //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+//  std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
+  } else {
+    U = Usave;
+  }
    
-  epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
-  //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
+  epsilon = epsilon*0.95*std::pow(tolerance/diff,1./3.); // rescale the step from the measured distance; runs after accepted and rejected steps alike
+  std::cout << GridLogMessage << "Distance : "<<diff<<" New epsilon : " << epsilon << std::endl;

 }

@@ -184,8 +207,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
  out = in;
-  taus = epsilon;
+//  taus = epsilon;
+  taus = 0.;
  unsigned int step = 0;
+  double measTau = epsilon*measure_interval;
+  std::cout << GridLogMessage << "measTau :"<< measTau << std::endl;
  do{
    step++;
    //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
@@ -193,10 +219,12 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
    std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
 		  << step << "  " << taus << "  "
 	      << energyDensityPlaquette(out) << std::endl;
-    if( step % measure_interval == 0){
+//    if( step % measure_interval == 0){
+    if( taus >  measTau ) {
      std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
 		<< step << "  " 
 		<< WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
+      measTau += epsilon*measure_interval;
    }
  } while (taus < maxTau);

--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -65,7 +65,8 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 #else


-#ifndef GRID_SYCL
+//#ifndef GRID_SYCL
+#if 1
 // Use the scalar as our own complex on GPU ... thrust::complex or std::complex
 template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
 typename vsimd::scalar_type
--- a/Grid/tensors/Tensor_outer.h
+++ b/Grid/tensors/Tensor_outer.h
@@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid);
 // outerProduct Scalar x Scalar -> Scalar
 //              Vector x Vector -> Matrix
 ///////////////////////////////////////////////////////////////////////////////////////
+template<class CC,IfComplex<CC> = 0>
+accelerator_inline CC outerProduct(const CC &l, const CC& r)
+{
+  return l*conj(r);
+}
+template<class RR,IfReal<RR> = 0>
+accelerator_inline RR outerProduct(const RR &l, const RR& r)
+{
+  return l*r;
+}

 template<class l,class r,int N> accelerator_inline
 auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
@@ -57,17 +67,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
  return ret;
 }

-template<class CC,IfComplex<CC> = 0>
-accelerator_inline CC outerProduct(const CC &l, const CC& r)
-{
-  return l*conj(r);
-}
-template<class RR,IfReal<RR> = 0>
-accelerator_inline RR outerProduct(const RR &l, const RR& r)
-{
-  return l*r;
-}
-
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -457,7 +457,7 @@ accelerator_inline void acceleratorSynchronise(void)
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
-  cl::sycl::detail::workGroupBarrier();
+  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) 

 **Data parallel C++ mathematical object library.**

--- a/tests/debug/Test_heatbath_dwf_eofa.cc
+++ b/tests/debug/Test_heatbath_dwf_eofa.cc
@@ -66,7 +66,9 @@ int main(int argc, char** argv)
  // Set up RNGs
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
+  GridSerialRNG sRNG;
  GridParallelRNG RNG5(FGrid);
+  sRNG.SeedFixedIntegers(seeds5);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
@@ -84,7 +86,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -94,7 +96,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

--- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc
+++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc
@@ -74,6 +74,9 @@ int main(int argc, char** argv)
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
+  GridSerialRNG sRNG;
+  RNG4.SeedFixedIntegers(seeds4);
+  sRNG.SeedFixedIntegers(seeds5);

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
@@ -90,7 +93,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -100,7 +103,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

--- a/tests/debug/Test_heatbath_mobius_eofa.cc
+++ b/tests/debug/Test_heatbath_mobius_eofa.cc
@@ -68,8 +68,10 @@ int main(int argc, char** argv)
  // Set up RNGs
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
+  GridSerialRNG sRNG;
  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(seeds5);
+  sRNG.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);

@@ -86,7 +88,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG,RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -96,7 +98,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG,RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

--- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc
+++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc
@@ -73,7 +73,9 @@ int main(int argc, char** argv)
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);
+  GridSerialRNG   sRNG;
  RNG5.SeedFixedIntegers(seeds5);
+  sRNG.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);

@@ -91,7 +93,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -101,7 +103,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;

 

@@ -59,6 +58,10 @@ int main (int argc, char ** argv)
  double beta = 1.0;
  double c1   = 0.331;

+  const int nu = 1;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
  ConjugatePlaqPlusRectangleActionR Action(beta,c1);
  //ConjugateWilsonGaugeActionR Action(beta);
  //WilsonGaugeActionR Action(beta);
--- a/tests/forces/Test_momentum_filter.cc
+++ b/tests/forces/Test_momentum_filter.cc
@@ -61,7 +61,9 @@ int main (int argc, char ** argv)
  std::vector<int> seeds({1,2,3,4});

  GridParallelRNG          pRNG(&Grid);
+  GridSerialRNG            sRNG;
  pRNG.SeedFixedIntegers(seeds);
+  sRNG.SeedFixedIntegers(seeds);

  typedef PeriodicGimplR Gimpl;
  typedef WilsonGaugeAction<Gimpl> GaugeAction;
@@ -115,7 +117,7 @@ int main (int argc, char ** argv)
  
  integrator.setMomentumFilter(filter);

-  integrator.refresh(U, pRNG); //doesn't actually change the gauge field
+  integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field

  //Check the momentum is zero on the boundary
  const auto &P = integrator.getMomentum();
--- a/tests/smearing/Test_WilsonFlow.cc
+++ b/tests/smearing/Test_WilsonFlow.cc
@@ -33,6 +33,7 @@ namespace Grid{
    GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
            int, steps,
            double, step_size,
+            double, tol,
            int, meas_interval,
            double, maxTau); // for the adaptive algorithm
       
@@ -82,13 +83,27 @@ int main(int argc, char **argv) {
  SU<Nc>::HotConfiguration(pRNG, Umu);
  
  typedef Grid::XmlReader       Serialiser;
-  Serialiser Reader("input.xml");
-  WFParameters WFPar(Reader);
-  ConfParameters CPar(Reader);
-  CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
+//  Serialiser Reader("input.xml");
+//  WFParameters WFPar(Reader);
+//  ConfParameters CPar(Reader);
+//  WFParameters WFPar;
+  int steps = 800;
+  double step_size=0.02;
+  double tol=1e-4;
+  int meas_interval=50;
+  double maxTau = 16;
+//  ConfParameters CPar;
+//  CPar. conf_prefix="configurations/ckpoint_lat";
+//  CPar. rng_prefix="rngs/ckpoint_rng";
+//  CPar. StartConfiguration=100,
+//  CPar. EndConfiguration=110,
+//  CPar. Skip=1;
+//  CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
+  CheckpointerParameters CPPar("configurations/ckpoint_lat","rngs/ckpoint_rng");
  BinaryHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);

-  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
+//  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
+  for (int conf = 100; conf <= 110; conf+= 1){

  CPBin.CheckpointRestore(conf, Umu, sRNG, pRNG);

@@ -96,9 +111,10 @@ int main(int argc, char **argv) {
  std::cout << GridLogMessage << "Initial plaquette: "
    << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;

-  WilsonFlow<PeriodicGimplR> WF(WFPar.steps, WFPar.step_size, WFPar.meas_interval);
+  WilsonFlow<PeriodicGimplR> WF(steps, step_size, meas_interval, tol); // forward the local tol rather than silently taking the constructor default

-  WF.smear_adaptive(Uflow, Umu, WFPar.maxTau);
+//  WF.smear_adaptive(Uflow, Umu, maxTau);
+  WF.smear(Uflow, Umu);

  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
Author	SHA1	Message	Date
Chulwoo Jung	b284d50863	Checking in fixed adaptive WilsonFlow	2021-06-07 14:20:27 -04:00
Peter Boyle	92def28bd3	Update README.md	2021-06-06 04:52:05 -04:00
Antonin Portelli	ca10bfa1c7	removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)	2021-06-04 11:12:22 +01:00
Peter Boyle	0e27e3847d	Remove synch	2021-06-03 04:24:19 +00:00
u61464	8cfc7342cd	staggered hand unroll read coalesce	2021-05-05 14:17:18 -07:00
u61464	15ae317858	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-05-04 08:40:38 -07:00
u61464	834f536b5f	Fastest option on SyCL is now std::complex	2021-05-04 08:40:18 -07:00
Peter Boyle	c332d9f08b	Merge pull request #356 from felixerben/bugfix/stoutSmearing Jamie's fix	2021-04-27 14:10:49 -04:00
Felix Erben	cf2923d5dd	Jamie's fix	2021-04-27 16:53:37 +01:00
Peter Boyle	0e4413ddde	Merge pull request #355 from felixerben/bugfix/stoutSmearing bugfix 3D stout smearing	2021-04-27 08:01:55 -04:00
Felix Erben	009ccd581e	bugfix 3D stout smearing	2021-04-26 10:36:33 +01:00
Peter Boyle	8cd4263974	Tests compile	2021-04-25 22:20:37 -04:00
Peter Boyle	d45c868656	Change interface	2021-04-25 10:53:34 -04:00
Peter Boyle	955a8113de	Expose label only to reduce number of parameters	2021-04-25 10:36:38 -04:00
Peter Boyle	dbe210dd53	Open the ens_id	2021-04-25 10:25:59 -04:00
Peter Boyle	86e11743ca	set twists	2021-04-20 10:19:11 -04:00
Peter Boyle	980e721f6e	Update MetaData.h	2021-04-13 09:33:01 -04:00
Peter Boyle	e2a0142d87	Merge pull request #348 from AndrewYongZhenNing/develop Conserved Tadpole Implementation for Shamir Action Only	2021-04-06 10:49:00 -04:00
Andrew Zhen Ning Yong	895244ecc3	Merge with upstream; implemented conserved tadpole for Shamir action.	2021-04-06 13:46:33 +01:00
Andrew Zhen Ning Yong	addeb621a7	Implemented tadpole operator for Shamir action.	2021-04-06 13:45:37 +01:00
Peter Boyle	a7fb25adf6	Make Cshift fields static to avoid repeated reallocaate overhead	2021-03-29 21:44:14 +02:00
Peter Boyle	e947992957	Improved force terms	2021-03-29 20:04:06 +02:00
Peter Boyle	bb89a82a07	Staggered coalseced read	2021-03-29 20:01:15 +02:00