Merge branch 'develop' into feature/hirep

2026-02-18 21:00:53 +00:00 · 2016-09-13 10:01:51 +01:00
parent 0fd179fb33 5df5d52d41
commit b9c80318a2
5 changed files with 291 additions and 304 deletions
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 echo "-- deploying Eigen source..."
-wget ${EIGEN_URL}
+wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -194,22 +194,22 @@ class BinaryIO {
      std::vector<int> site({x,y,z,t});
-      if ( grid->IsBoss() ) {
+      if (grid->IsBoss()) {
-	fin.read((char *)&file_object,sizeof(file_object));
+        fin.read((char *)&file_object, sizeof(file_object));
-	bytes += sizeof(file_object);
+        bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
      }
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -254,20 +254,20 @@ class BinaryIO {
      if ( grid->IsBoss() ) {
-	
+  
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-	// NB could gather an xstrip as an optimisation.
+  // NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
+  fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  bytes+=sizeof(file_object);
      }
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -305,15 +305,15 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  parallel.GetState(saved,l_idx);
      }
      grid->Broadcast(rank,(void *)&saved[0],bytes);
      if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  fout.write((char *)&saved[0],bytes);
      }
    }
@@ -355,14 +355,14 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
+  fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
      }
      grid->Broadcast(0,(void *)&saved[0],bytes);
      if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
      }
    }
@@ -415,15 +415,15 @@ class BinaryIO {
      if ( d == 0 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
    }
@@ -434,9 +434,9 @@ class BinaryIO {
      std::cout<< std::dec ;
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -463,7 +463,7 @@ class BinaryIO {
      // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
      /////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
      // iorank reads from the seek
      ////////////////////////////////
      if (myrank == iorank) {
-	
+  
-	fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
-	
+  
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	munge(fileObj,siteObj,csum);
+  munge(fileObj,siteObj,csum);
-      }	
+      } 
      // Possibly do transport through pt2pt 
      if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
+  if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  }
      }
      // Poke at destination
      if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
      }
      grid->Barrier(); // necessary?
    }
@@ -520,7 +520,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@@ -558,15 +558,15 @@ class BinaryIO {
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
      grid->GlobalSum(tmp);
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -610,7 +610,7 @@ class BinaryIO {
    // should aggregate a whole chunk and then write.
    // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
@@ -640,26 +640,26 @@ class BinaryIO {
      // Pair of nodes may need to do pt2pt send
      if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
+    // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  }
      }
      grid->Barrier(); // necessary?
      if (myrank == iorank) {
-	
+  
-	munge(siteObj,fileObj,csum);
+  munge(siteObj,fileObj,csum);
-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
      }
    }
@@ -668,7 +668,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -55,11 +55,14 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
    // note: static const int or constexpr will work for type deductions
    //       with the intel compiler (up to version 17)
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
+    #define ColourIndex  2
-    static const int SpinIndex   = 1;
+    #define SpinIndex    1
-    static const int LorentzIndex= 0;
+    #define LorentzIndex 0
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -49,154 +49,171 @@ namespace Grid {
    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
    public:
-     INHERIT_IMPL_TYPES(Impl);
+      INHERIT_IMPL_TYPES(Impl);
-     typedef FermionOperator<Impl> Base;
+      typedef FermionOperator<Impl> Base;
    public:
-  template <bool EnableBool = true>
+      template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
+      typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
-  DiracOptDhopSite(
+	DiracOptDhopSite(
-      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-      int sF, int sU, int Ls, int Ns, const FermionField &in,
+			 int sF, int sU, int Ls, int Ns, const FermionField &in,
-      FermionField &out) {
+			 FermionField &out) {
 #ifdef AVX512
-    if (AsmOpt) {
+	if (AsmOpt) {
-      WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
+	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
-                                               in, out);
+						   in, out);
-    } else {
+	} else {
 #else
-    {
+	  {
 #endif
-      for (int site = 0; site < Ns; site++) {
+	    for (int site = 0; site < Ns; site++) {
-        for (int s = 0; s < Ls; s++) {
+	      for (int s = 0; s < Ls; s++) {
-          if (HandOpt)
+		if (HandOpt)
-            WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
+		  WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
-                                                      in, out);
+							    in, out);
-          else
+		else
-            WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
+		  WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
-                                                         in, out);
+							       in, out);
-          sF++;
+		sF++;
-        }
+	      }
-        sU++;
+	      sU++;
-      }
+	    }
-    }
+	  }
-  }
+	}
-  template <bool EnableBool = true>
+	template <bool EnableBool = true>
-    typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
+	  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
-  DiracOptDhopSite(
+	  DiracOptDhopSite(
-      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-      int sF, int sU, int Ls, int Ns, const FermionField &in,
+			   int sF, int sU, int Ls, int Ns, const FermionField &in,
-      FermionField &out) {
+			   FermionField &out) {
-    for (int site = 0; site < Ns; site++) {
+	  for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
+	    for (int s = 0; s < Ls; s++) {
-        WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
+	      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
-                                                     out);
+							   out);
-        sF++;
+	      sF++;
-      }
+	    }
-      sU++;
+	    sU++;
-    }
+	  }
-  }
+	}
-  template <bool EnableBool = true>
+	template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
+	  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
-                          void>::type
+				  void>::type
-  DiracOptDhopSiteDag(
+	  DiracOptDhopSiteDag(
-      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-      int sF, int sU, int Ls, int Ns, const FermionField &in,
+			      int sF, int sU, int Ls, int Ns, const FermionField &in,
-      FermionField &out) {
+			      FermionField &out) {
 #ifdef AVX512
-    if (AsmOpt) {
+				    if (AsmOpt) {
-      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
+				      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
-                                                  Ns, in, out);
+										  Ns, in, out);
-    } else {
+				    } else {
 #else
-    {
+				      {
 #endif
-      for (int site = 0; site < Ns; site++) {
+					for (int site = 0; site < Ns; site++) {
-        for (int s = 0; s < Ls; s++) {
+					  for (int s = 0; s < Ls; s++) {
-          if (HandOpt)
+					    if (HandOpt)
-            WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
+					      WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
-                                                         in, out);
+											   in, out);
-          else
+					    else
-            WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
+					      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
-                                                            sU, in, out);
+											      sU, in, out);
-          sF++;
+					    sF++;
-        }
+					  }
-        sU++;
+					  sU++;
 					}
 				      }
 				    }
 				    // Generic DhopSiteDag driver.  This overload is selected by
 				    // std::enable_if when Impl::Dimension != 3, or when
 				    // Impl::Dimension == 3 and Nc != 3.  It always calls
 				    // DiracOptGenericDhopSiteDag, once per (site, s) pair;
 				    // st, lo, U, buf, in and out are forwarded unchanged.
 				    // EnableBool defaults to true and is part of the enable_if condition.
 				    template <bool EnableBool = true>
 				      typename std::enable_if<
 				      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
 				      void>::type
 				      DiracOptDhopSiteDag(
 							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 							  int sF, int sU, int Ls, int Ns, const FermionField &in,
 							  FermionField &out) {
 					// sF and sU are by-value copies used as running indices:
 					// sF advances after every inner (s) step, sU after every outer (site) step.
 					for (int site = 0; site < Ns; site++) {
 					  for (int s = 0; s < Ls; s++) {
 					    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
 											    in, out);
 					    sF++;
 					  }
 					  sU++;
 					}
 				      }
 				    void DiracOptDhopDir(
 							 StencilImpl &st, DoubledGaugeField &U,
 							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
 							 int gamma);
 	private:
 				    // Specialised variants
 				    void DiracOptGenericDhopSite(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptGenericDhopSiteDag(
 								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 								    int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptAsmDhopSite(
 							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 							     int sF, int sU, int Ls, int Ns, const FermionField &in,
 							     FermionField &out);
 				    void DiracOptAsmDhopSiteDag(
 								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 								int sF, int sU, int Ls, int Ns, const FermionField &in,
 								FermionField &out);
 				    void DiracOptHandDhopSite(
 							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 							      int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptHandDhopSiteDag(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 	public:
 				    WilsonKernels(const ImplParams &p = ImplParams());
 				  };
 	///////////////////////////////////////////////////////////
 	// Default to no assembler implementation
 	///////////////////////////////////////////////////////////
 	// Primary-template definition of DiracOptAsmDhopSite: it applies to any
 	// Impl without an explicit specialisation.  The body only asserts; none
 	// of the arguments are read.
 	template<class Impl>
 	  void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 	{
 	  // Fails whenever assert is active.  With NDEBUG defined the assert
 	  // expands to nothing and the function returns without writing to out.
 	  assert(0);
 	}
 	// Primary-template definition of DiracOptAsmDhopSiteDag: it applies to
 	// any Impl without an explicit specialisation.  The body only asserts;
 	// none of the arguments are read.
 	template<class Impl>
 	  void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 	{
 	  // Fails whenever assert is active.  With NDEBUG defined the assert
 	  // expands to nothing and the function returns without writing to out.
 	  assert(0);
 	}
      }
    }
  }
   // Generic DhopSiteDag driver.  This overload is selected by std::enable_if
   // when Impl::Dimension != 3, or when Impl::Dimension == 3 and Nc != 3.
   // It always calls DiracOptGenericDhopSiteDag, once per (site, s) pair;
   // st, lo, U, buf, in and out are forwarded unchanged.
   // EnableBool defaults to true and is part of the enable_if condition.
   template <bool EnableBool = true>
   typename std::enable_if<
       (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
       void>::type
   DiracOptDhopSiteDag(
       StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
       std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
       int sF, int sU, int Ls, int Ns, const FermionField &in,
       FermionField &out) {
     // sF and sU are by-value copies used as running indices: sF advances
     // after every inner (s) step, sU after every outer (site) step, so the
     // loops make Ns * Ls calls in total.
     for (int site = 0; site < Ns; site++) {
       for (int s = 0; s < Ls; s++) {
         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
                                                         in, out);
         sF++;
       }
       sU++;
     }
   }
  void DiracOptDhopDir(
      StencilImpl &st, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
      int gamma);
 private:
  // Specialised variants
  void DiracOptGenericDhopSite(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, const FermionField &in, FermionField &out);
  void DiracOptGenericDhopSiteDag(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, const FermionField &in, FermionField &out);
  void DiracOptAsmDhopSite(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, int Ls, int Ns, const FermionField &in,
      FermionField &out);
  void DiracOptAsmDhopSiteDag(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, int Ls, int Ns, const FermionField &in,
      FermionField &out);
  void DiracOptHandDhopSite(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, const FermionField &in, FermionField &out);
  void DiracOptHandDhopSiteDag(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
      int sF, int sU, const FermionField &in, FermionField &out);
 public:
  WilsonKernels(const ImplParams &p = ImplParams());
  };
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@@ -26,68 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
-namespace QCD {
+  namespace QCD {
-
+    
  ///////////////////////////////////////////////////////////
  // Default to no assembler implementation
  ///////////////////////////////////////////////////////////
  // Primary-template definition of DiracOptAsmDhopSite: it applies to any
  // Impl without an explicit specialisation.  The body only asserts; none of
  // the arguments are read.
  template<class Impl>
  void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  // Fails whenever assert is active.  With NDEBUG defined the assert expands
  // to nothing and the function returns without writing to out.
  assert(0);
 }
 #if defined(AVX512) 
-
+    
-
+    
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
+    // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
+    
-static Vector<vComplexF> signs;
+    static Vector<vComplexF> signs;
-
+    
-int setupSigns(void ){
+    int setupSigns(void ){
-  Vector<vComplexF> bother(2);
+      Vector<vComplexF> bother(2);
-  signs = bother;
+      signs = bother;
-  vrsign(signs[0]);
+      vrsign(signs[0]);
-  visign(signs[1]);
+      visign(signs[1]);
-  return 1;
+      return 1;
-}
+    }
-static int signInit = setupSigns();
+    static int signInit = setupSigns();
-
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-
+  
 #undef KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
 #define KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -98,43 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-
+				    
 #undef KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #define KERNEL_DAG
-template<>
+    template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #endif
-
+  }
-
+}
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}