From aa20cc8b52e90bbe0bbc960e2ea8e49b7f0baa5b Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 9 Sep 2016 02:53:22 -0700
Subject: [PATCH 1/3] Fixing compilation error with AVX512 flag

---
 bootstrap.sh                               |  2 +-
 lib/qcd/action/fermion/WilsonKernels.h     | 19 ++++++++++++-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 32 +++++++++++++++++++++-
 3 files changed, 50 insertions(+), 3 deletions(-)
diff --git a/bootstrap.sh b/bootstrap.sh
index 461eb121..f847b7ab 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 
 echo "-- deploying Eigen source..."
-wget ${EIGEN_URL}
+wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
 
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index b679d3f9..b551319b 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -96,7 +96,24 @@ namespace Grid {
      WilsonKernels(const ImplParams &p= ImplParams());
      
     };
-
+    
+    
+    template<class Impl>
+      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+      {
+	assert(0);
+      }
+    template<class Impl>
+      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+      {
+	assert(0);
+      }
+    
+    
   }
 }
 #endif
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index b443ccf9..ce592540 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -38,6 +38,7 @@ namespace QCD {
   ///////////////////////////////////////////////////////////
   // Default to no assembler implementation
   ///////////////////////////////////////////////////////////
+  /*
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -45,6 +46,14 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & l
 {
   assert(0);
 }
+template<class Impl>
+void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+  */
 
 #if defined(AVX512) 
 
@@ -116,7 +125,7 @@ void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st
 #endif
 
 
-
+/*
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
@@ -136,5 +145,26 @@ template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilIm
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+*/
 }}
 

From f76f281e58e5e703ea0cd9806b1f4c729014a785 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 9 Sep 2016 11:34:25 +0100
Subject: [PATCH 2/3] Cleaning files after fix

---
 lib/parallelIO/BinaryIO.h                  | 166 ++++++++++-----------
 lib/qcd/action/fermion/WilsonKernels.h     |  31 ++--
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 159 ++++++--------------
 3 files changed, 147 insertions(+), 209 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 184209dc..5eddb57d 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -194,22 +194,22 @@ class BinaryIO {
 
       std::vector<int> site({x,y,z,t});
 
-      if ( grid->IsBoss() ) {
-	fin.read((char *)&file_object,sizeof(file_object));
-	bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+      if (grid->IsBoss()) {
+        fin.read((char *)&file_object, sizeof(file_object));
+        bytes += sizeof(file_object);
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
 
-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
       }
       // The boss who read the file has their value poked
       pokeSite(munged,Umu,site);
     }}}}
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
@@ -254,20 +254,20 @@ class BinaryIO {
 
       
       if ( grid->IsBoss() ) {
-	
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 
-	// NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  // NB could gather an xstrip as an optimisation.
+  fout.write((char *)&file_object,sizeof(file_object));
+  bytes+=sizeof(file_object);
       }
     }}}}
     timer.Stop();
     std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
@@ -305,15 +305,15 @@ class BinaryIO {
       int l_idx=parallel.generator_idx(o_idx,i_idx);
 
       if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  parallel.GetState(saved,l_idx);
       }
 
       grid->Broadcast(rank,(void *)&saved[0],bytes);
 
       if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fout.write((char *)&saved[0],bytes);
       }
 
     }
@@ -355,14 +355,14 @@ class BinaryIO {
       int l_idx=parallel.generator_idx(o_idx,i_idx);
 
       if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fin.read((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
       }
 
       grid->Broadcast(0,(void *)&saved[0],bytes);
 
       if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
       }
 
     }
@@ -415,15 +415,15 @@ class BinaryIO {
 
       if ( d == 0 ) parallel[d] = 0;
       if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
       } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;
 
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
       }
       slice_vol = slice_vol * range[d];
     }
@@ -434,9 +434,9 @@ class BinaryIO {
       std::cout<< std::dec ;
       std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
       }
       std::cout << std::endl;
     }
@@ -463,7 +463,7 @@ class BinaryIO {
 
       // need to implement these loops in Nd independent way with a lexico conversion
     for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
       std::vector<int> tsite(nd); // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
       Lexicographic::CoorFromIndex(tsite,tlex,range);
 
       for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
       }
 
       /////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
       // iorank reads from the seek
       ////////////////////////////////
       if (myrank == iorank) {
-	
-	fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
-	
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
-	munge(fileObj,siteObj,csum);
+  
+  fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
+  
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  
+  munge(fileObj,siteObj,csum);
 
-      }	
+      } 
 
       // Possibly do transport through pt2pt 
       if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) {
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+  }
       }
       // Poke at destination
       if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
       }
       grid->Barrier(); // necessary?
     }
@@ -520,7 +520,7 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
     
     return csum;
   }
@@ -558,15 +558,15 @@ class BinaryIO {
       if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
 
       if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
       } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;
 
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
       }
 
       slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
       grid->GlobalSum(tmp);
       std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
       }
       std::cout << std::endl;
     }
@@ -610,7 +610,7 @@ class BinaryIO {
     // should aggregate a whole chunk and then write.
     // need to implement these loops in Nd independent way with a lexico conversion
     for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
       std::vector<int> tsite(nd); // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
       Lexicographic::CoorFromIndex(tsite,tlex,range);
 
       for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
       }
 
 
@@ -640,26 +640,26 @@ class BinaryIO {
 
       // Pair of nodes may need to do pt2pt send
       if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+    // Send to IOrank 
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+  }
       }
 
       grid->Barrier(); // necessary?
 
       if (myrank == iorank) {
-	
-	munge(siteObj,fileObj,csum);
+  
+  munge(siteObj,fileObj,csum);
 
-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
-	fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  
+  fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
       }
     }
 
@@ -668,7 +668,7 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index b551319b..d796e235 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -97,23 +97,24 @@ namespace Grid {
      
     };
     
-    
+    ///////////////////////////////////////////////////////////
+    // Default to no assembler implementation
+    ///////////////////////////////////////////////////////////
     template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-      {
-	assert(0);
-      }
+    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
     template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-      {
-	assert(0);
-      }
-    
-    
+    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
+        
   }
 }
 #endif
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index ce592540..d2cb4285 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -26,77 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 
 #include <Grid.h>
 
 namespace Grid {
-namespace QCD {
-
-
-  ///////////////////////////////////////////////////////////
-  // Default to no assembler implementation
-  ///////////////////////////////////////////////////////////
-  /*
-template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-  */
-
+  namespace QCD {
+    
 #if defined(AVX512) 
-
-
-  ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
-
+    
+    
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
-static Vector<vComplexF> signs;
-
-int setupSigns(void ){
-  Vector<vComplexF> bother(2);
-  signs = bother;
-  vrsign(signs[0]);
-  visign(signs[1]);
-  return 1;
-}
-static int signInit = setupSigns();
-
+    
+    static Vector<vComplexF> signs;
+    
+    int setupSigns(void ){
+      Vector<vComplexF> bother(2);
+      signs = bother;
+      vrsign(signs[0]);
+      visign(signs[1]);
+      return 1;
+    }
+    static int signInit = setupSigns();
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-
+  
 #undef KERNEL_DAG
-template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
 #define KERNEL_DAG
-template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -107,64 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-
+				    
 #undef KERNEL_DAG
-template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #define KERNEL_DAG
-template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #endif
-
-
-/*
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-*/
-}}
+  }
+}
 

From 5df5d52d417decf8d9a229a674a9d4a082149767 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 12 Sep 2016 17:17:20 +0100
Subject: [PATCH 3/3] Fix for the Intel compiler

---
 lib/qcd/QCD.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/QCD.h b/lib/qcd/QCD.h
index ec11c837..0e9d3c17 100644
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -55,11 +55,14 @@ namespace QCD {
     //////////////////////////////////////////////////////////////////////////////
     // QCD iMatrix types
     // Index conventions:                            Lorentz x Spin x Colour
+    // note: static const int or constexpr will not work for type deductions
+    //       with the intel compiler (up to version 17), hence the macros below
     //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
-    static const int SpinIndex   = 1;
-    static const int LorentzIndex= 0;
+    #define ColourIndex  2
+    #define SpinIndex    1
+    #define LorentzIndex 0
 
+  
     // Also should make these a named enum type
     static const int DaggerNo=0;
     static const int DaggerYes=1;