diff --git a/BLAS_benchmark/BatchBlasBench.cc b/BLAS_benchmark/BatchBlasBench.cc
index 77f9e75d..6db7160a 100644
--- a/BLAS_benchmark/BatchBlasBench.cc
+++ b/BLAS_benchmark/BatchBlasBench.cc
@@ -120,7 +120,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(ba
 	     cudaGetErrorString( err ));				\
       printf("File %s Line %d\n",__FILE__,__LINE__);			\
       fflush(stdout);							\
-      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
+      if (acceleratorAbortOnGpuError) GRID_ASSERT(err==cudaSuccess);		\
     }									\
   }
 
@@ -168,7 +168,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
 
@@ -276,11 +276,11 @@ public:
   {
 #ifdef GRID_HIP
     auto err = hipDeviceSynchronize();
-    assert(err==hipSuccess);
+    GRID_ASSERT(err==hipSuccess);
 #endif
 #ifdef GRID_CUDA
     auto err = cudaDeviceSynchronize();
-    assert(err==cudaSuccess);
+    GRID_ASSERT(err==cudaSuccess);
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
@@ -305,8 +305,8 @@ public:
   {
     RealD t2=usecond();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    GRID_ASSERT(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -341,7 +341,7 @@ public:
 			    (hipblasDoubleComplex *) Bkn, ldb,
 			    (hipblasDoubleComplex *) &beta_p[0],
 			    (hipblasDoubleComplex *) Cmn, ldc);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -361,7 +361,7 @@ public:
 			   (cuDoubleComplex *) Bkn, ldb,
 			   (cuDoubleComplex *) &beta_p[0],
 			   (cuDoubleComplex *) Cmn, ldc);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -433,8 +433,8 @@ public:
   {
     RealD t2=usecond();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    GRID_ASSERT(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -469,7 +469,7 @@ public:
 			    (hipblasComplex *) Bkn, ldb,
 			    (hipblasComplex *) &beta_p[0],
 			    (hipblasComplex *) Cmn, ldc);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -489,7 +489,7 @@ public:
 			   (cuComplex *) Bkn, ldb,
 			   (cuComplex *) &beta_p[0],
 			   (cuComplex *) Cmn, ldc);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -595,11 +595,11 @@ public:
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    GRID_ASSERT(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -636,7 +636,7 @@ public:
 				   (hipblasDoubleComplex **)&Cmn[0], ldc,
 				   batchCount);
     //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -657,7 +657,7 @@ public:
 				  (cuDoubleComplex *) &beta_p[0],
 				  (cuDoubleComplex **)&Cmn[0], ldc,
 				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -804,8 +804,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
+    GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+    GRID_ASSERT(OpB!=GridBLAS_OP_T);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -821,8 +821,8 @@ public:
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
     RealD t0=usecond();
 
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 #ifdef GRID_HIP
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
@@ -843,7 +843,7 @@ public:
 				   (hipblasComplex **)&Cmn[0], ldc,
 				   batchCount);
 
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -864,7 +864,7 @@ public:
 				  (cuComplex *) &beta_p[0],
 				  (cuComplex **)&Cmn[0], ldc,
 				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
diff --git a/Grid/GridStd.h b/Grid/GridStd.h
index d0a8124a..d53a11a1 100644
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -1,9 +1,17 @@
 #ifndef GRID_STD_H
 #define GRID_STD_H
 
+///////////////////
+// Grid config
+///////////////////
+#include "Config.h"
+
 ///////////////////
 // Std C++ dependencies
 ///////////////////
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
 #include <cassert>
 #include <complex>
 #include <memory>
@@ -15,7 +23,9 @@
 #include <random>
 #include <functional>
 #include <stdio.h>
+#include <string.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <strings.h>
 #include <stdio.h>
 #include <signal.h>
@@ -23,11 +33,36 @@
 #include <sys/time.h>
 #include <chrono>
 #include <zlib.h>
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+
+void GridAbort(void); // Declared only; GRID_ASSERT calls it last, after the diagnostics below have been written.
+
+#define ASSLOG(A) ::write(STDERR_FILENO,A,::strlen(A)); /* write the C string A directly to stderr via write() */
+#ifdef HAVE_EXECINFO_H
+#define GRID_ASSERT(b) do { if(!(b)) { /* do/while(0): a single statement, safe under an unbraced if/else */ \
+    fflush(stdout); \
+    ASSLOG(" GRID_ASSERT failure: ");					\
+    ASSLOG(__FILE__);							\
+    ASSLOG(" : ");							\
+    ASSLOG(#b);								\
+    ASSLOG(" : ");							\
+    int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);		\
+    backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO);	\
+    GridAbort();							\
+  } } while(0)
+#else
+#define GRID_ASSERT(b) do { if(!(b)) { /* no <execinfo.h>: same report, without the backtrace */ \
+    fflush(stdout); ASSLOG(" GRID_ASSERT failure: ");			\
+    ASSLOG(__FILE__);							\
+    ASSLOG(" : ");							\
+    ASSLOG(#b);								\
+    ASSLOG(" : ");							\
+    GridAbort();							\
+  } } while(0)
+#endif
 
-///////////////////
-// Grid config
-///////////////////
-#include "Config.h"
 
 #ifdef TOFU
 #undef GRID_COMMS_THREADS
diff --git a/Grid/Makefile.am b/Grid/Makefile.am
index 8472dd71..45d391ac 100644
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -54,21 +54,25 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc
 
-extra_sources+=$(WILS_FERMION_FILES)
-extra_sources+=$(STAG_FERMION_FILES)
+if BUILD_FERMION_INSTANTIATIONS
+  extra_sources+=$(WILS_FERMION_FILES)
+  extra_sources+=$(STAG_FERMION_FILES)
 if BUILD_ZMOBIUS
-  extra_sources+=$(ZWILS_FERMION_FILES)
+    extra_sources+=$(ZWILS_FERMION_FILES)
 endif
 if BUILD_GPARITY
-  extra_sources+=$(GP_FERMION_FILES)
+    extra_sources+=$(GP_FERMION_FILES)
 endif
 if BUILD_FERMION_REPS
-  extra_sources+=$(ADJ_FERMION_FILES)
-  extra_sources+=$(TWOIND_FERMION_FILES)
+    extra_sources+=$(ADJ_FERMION_FILES)
+    extra_sources+=$(TWOIND_FERMION_FILES)
 endif
 if BUILD_SP
     extra_sources+=$(SP_FERMION_FILES)
-    extra_sources+=$(SP_TWOIND_FERMION_FILES)
+if BUILD_FERMION_REPS
+      extra_sources+=$(SP_TWOIND_FERMION_FILES)
+endif
+endif
 endif
 
 lib_LIBRARIES = libGrid.a
diff --git a/Grid/Namespace.h b/Grid/Namespace.h
index c42b46b3..be90e2c8 100644
--- a/Grid/Namespace.h
+++ b/Grid/Namespace.h
@@ -29,8 +29,8 @@ directory
 #pragma once
 
 #include <type_traits>
-#include <cassert>
 #include <exception>
+#include <cassert>
 
 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index a3b0b026..adac8fec 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -51,6 +51,8 @@ NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
 #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
+// Not really deflation, but useful
+#include <Grid/algorithms/blas/MomentumProject.h>
 NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
@@ -84,5 +86,6 @@ NAMESPACE_CHECK(multigrid);
 
 #include <Grid/algorithms/iterative/KrylovSchur.h>
 #include <Grid/algorithms/iterative/Arnoldi.h>
+#include <Grid/algorithms/iterative/LanczosBidiagonalization.h>
 
 #endif
diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h
index 329d1d46..de621387 100644
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -28,6 +28,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 
+#ifdef GRID_CUDA
+#include <cufft.h>
+#endif
+
+#ifdef GRID_HIP
+#include <hipfft/hipfft.h>
+#endif
+
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 #ifdef HAVE_FFTW
 #if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
@@ -35,88 +44,190 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <fftw3.h>
 #endif
 #endif
+#endif
 
 NAMESPACE_BEGIN(Grid);
 
-template<class scalar> struct FFTW { };
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#define FFTW_ESTIMATE (0)
+#endif
 
-#ifdef HAVE_FFTW	
+template<class scalar> struct FFTW { // Primary template declares no members; backend types and calls come from the ComplexD/ComplexF specialisations below.
+};
+
+#ifdef GRID_HIP
 template<> struct FFTW<ComplexD> {
 public:
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef hipfftDoubleComplex FFTW_scalar;
+  typedef hipfftHandle        FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
+				      int istride, int idist,		
+				      FFTW_scalar *out, int *onembed,		
+				      int ostride, int odist,		
+				      int sign, unsigned flags) {
+    FFTW_plan p;
+    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return p;
+  }	  
+    
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    hipfftResult rv;
+    if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
+    else                   rv =hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+  // hipFFT backend for ComplexF: FFTW-named static wrappers over hipfftPlanMany / hipfftExecC2C / hipfftDestroy.
+  static const int forward  = FFTW_FORWARD;
+  static const int backward = FFTW_BACKWARD;
+  using FFTW_scalar = hipfftComplex;
+  using FFTW_plan   = hipfftHandle;
+  // Create a HIPFFT_C2C many-plan; rank, n, strides, distances and howmany are forwarded to hipfftPlanMany.
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n, int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan plan; // in/out/inembed/onembed/sign/flags are not consulted; n is passed as both embed arrays
+    const auto rv = hipfftPlanMany(&plan,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return plan;
+  }
+  // Run the planned single-precision transform; accelerator_barrier() is called before the status is checked.
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    // Any sign other than `forward` selects the backward transform.
+    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
+    const hipfftResult rv = hipfftExecC2C(p,in,out,direction);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p); // status of the destroy call is not checked
+  }
+};
+#endif
+
+#ifdef GRID_CUDA
+template<> struct FFTW<ComplexD> {
+  // cuFFT backend for ComplexD; plan and execute statuses are checked with GRID_ASSERT, as in the hipFFT backend.
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftDoubleComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p; // in/out/inembed/onembed/sign/flags are not consulted; n is passed as both embed arrays
+    const cufftResult rv = cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
+    GRID_ASSERT(rv==CUFFT_SUCCESS); // previously unchecked: a failed plan would hand back an unset handle
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    const int direction = ( sign == forward ) ? CUFFT_FORWARD : CUFFT_INVERSE; // non-forward sign => inverse
+    const cufftResult rv = cufftExecZ2Z(p,in,out,direction);
+    accelerator_barrier();
+    GRID_ASSERT(rv==CUFFT_SUCCESS); // previously the execute status was discarded
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p); // status of the destroy call is not checked
+  }
+};
+template<> struct FFTW<ComplexF> {
+  // cuFFT backend for ComplexF; plan and execute statuses are checked with GRID_ASSERT, as in the hipFFT backend.
+  static const int forward=FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p; // in/out/inembed/onembed/sign/flags are not consulted; n is passed as both embed arrays
+    const cufftResult rv = cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
+    GRID_ASSERT(rv==CUFFT_SUCCESS); // previously unchecked: a failed plan would hand back an unset handle
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    const int direction = ( sign == forward ) ? CUFFT_FORWARD : CUFFT_INVERSE; // non-forward sign => inverse
+    const cufftResult rv = cufftExecC2C(p,in,out,direction);
+    accelerator_barrier();
+    GRID_ASSERT(rv==CUFFT_SUCCESS); // previously the execute status was discarded
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p); // status of the destroy call is not checked
+  }
+};
+#endif
+
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef HAVE_FFTW
+template<> struct FFTW<ComplexD> {
+public:
   typedef fftw_complex FFTW_scalar;
   typedef fftw_plan    FFTW_plan;
-
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-				      FFTW_scalar *in, const int *inembed,		
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
 				      int istride, int idist,		
-				      FFTW_scalar *out, const int *onembed,		
+				      FFTW_scalar *out, int *onembed,		
 				      int ostride, int odist,		
 				      int sign, unsigned flags) {
     return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }	  
     
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftw_flops(p,add,mul,fmas);
-  }
-
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftw_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftw_destroy_plan(p);
   }
 };
-
 template<> struct FFTW<ComplexF> {
 public:
-
   typedef fftwf_complex FFTW_scalar;
   typedef fftwf_plan    FFTW_plan;
-
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-				      FFTW_scalar *in, const int *inembed,		
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+				      FFTW_scalar *in, int *inembed,		
 				      int istride, int idist,		
-				      FFTW_scalar *out, const int *onembed,		
+				      FFTW_scalar *out, int *onembed,		
 				      int ostride, int odist,		
 				      int sign, unsigned flags) {
     return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }	  
     
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftwf_flops(p,add,mul,fmas);
-  }
-
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftwf_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftwf_destroy_plan(p);
   }
 };
-
 #endif
-
-#ifndef FFTW_FORWARD
-#define FFTW_FORWARD (-1)
-#define FFTW_BACKWARD (+1)
 #endif
 
 class FFT {
 private:
     
-  GridCartesian *vgrid;
-  GridCartesian *sgrid;
-    
-  int Nd;
   double flops;
   double flops_call;
   uint64_t usec;
     
-  Coordinate dimensions;
-  Coordinate processors;
-  Coordinate processor_coor;
-    
 public:
     
   static const int forward=FFTW_FORWARD;
@@ -126,31 +237,25 @@ public:
   double MFlops(void) {return flops/usec;}
   double USec(void)   {return (double)usec;}    
 
-  FFT ( GridCartesian * grid ) :
-    vgrid(grid),
-    Nd(grid->_ndimension),
-    dimensions(grid->_fdimensions),
-    processors(grid->_processors),
-    processor_coor(grid->_processor_coor)
+  FFT ( GridCartesian * grid ) 
   {
     flops=0;
     usec =0;
-    Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
   };
     
   ~FFT ( void)  {
-    delete sgrid;
+    //    delete sgrid;
   }
     
   template<class vobj>
   void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
 
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
-    Lattice<vobj> tmp(vgrid);
-    tmp = source;
-    for(int d=0;d<Nd;d++){
+    //    vgrid=result.Grid();
+    //    conformable(result.Grid(),vgrid);
+    //    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    Lattice<vobj> tmp = source;
+    for(int d=0;d<Ndim;d++){
       if( mask[d] ) {
 	FFT_dim(result,tmp,d,sign);
 	tmp=result;
@@ -160,62 +265,70 @@ public:
 
   template<class vobj>
   void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
-    Coordinate mask(Nd,1);
+    const int Ndim = source.Grid()->Nd();
+    Coordinate mask(Ndim,1);
     FFT_dim_mask(result,source,mask,sign);
   }
 
 
   template<class vobj>
   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
-#ifndef HAVE_FFTW
-    std::cerr << "FFTW is not compiled but is called"<<std::endl;
-    assert(0);
-#else
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    GridBase *grid = source.Grid();
+    conformable(result.Grid(),source.Grid());
 
-    int L = vgrid->_ldimensions[dim];
-    int G = vgrid->_fdimensions[dim];
-      
-    Coordinate layout(Nd,1);
-    Coordinate pencil_gd(vgrid->_fdimensions);
-      
-    pencil_gd[dim] = G*processors[dim];
-      
-    // Pencil global vol LxLxGxLxL per node
-    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+    int L = grid->_ldimensions[dim];
+    int G = grid->_fdimensions[dim];
       
+    Coordinate layout(Ndim,1);
+    
     // Construct pencils
     typedef typename vobj::scalar_object sobj;
-    typedef typename sobj::scalar_type   scalar;
+    typedef typename vobj::scalar_type   scalar;
+    typedef typename vobj::scalar_type   scalar_type;
+    typedef typename vobj::vector_type   vector_type;
       
-    Lattice<sobj> pgbuf(&pencil_g);
-    autoView(pgbuf_v , pgbuf, CpuWrite);
     //std::cout << "CPU view" << std::endl;
     
     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
       
     int Ncomp = sizeof(sobj)/sizeof(scalar);
-    int Nlow  = 1;
+    int64_t Nlow  = 1;
+    int64_t Nhigh = 1;
+
     for(int d=0;d<dim;d++){
-      Nlow*=vgrid->_ldimensions[d];
+      Nlow*=grid->_ldimensions[d];
     }
+    for(int d=dim+1;d<Ndim;d++){
+      Nhigh*=grid->_ldimensions[d];
+    }
+    int64_t Nperp=Nlow*Nhigh;
+    
+    deviceVector<scalar> pgbuf; // Layout is [component][perp][dim]: dim (length G) is contiguous, component is slowest
+    pgbuf.resize(Nperp*Ncomp*G);
+    scalar *pgbuf_v = &pgbuf[0];
       
     int rank = 1;  /* 1d transforms */
     int n[] = {G}; /* 1d transforms of length G */
-    int howmany = Ncomp;
+    int howmany = Ncomp * Nperp;
     int odist,idist,istride,ostride;
-    idist   = odist   = 1;          /* Distance between consecutive FT's */
-    istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+    idist   = odist   = G;            /* Distance between consecutive FT's */
+    istride = ostride = 1;            /* Distance between two elements in the same FT */
     int *inembed = n, *onembed = n;
       
     scalar div;
     if ( sign == backward ) div = 1.0/G;
     else if ( sign == forward ) div = 1.0;
-    else assert(0);
-      
-    //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    else GRID_ASSERT(0);
+
+    double t_pencil=0;
+    double t_fft   =0;
+    double t_total =-usecond();
+    //    std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+    /*
+     *
+     */
     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,72 +342,154 @@ public:
     }
       
     // Barrel shift and collect global pencil
-    //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
-    Coordinate lcoor(Nd), gcoor(Nd);
+    //    std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+    Coordinate lcoor(Ndim), gcoor(Ndim);
+    double t_copy=0;
+    double t_shift=0;
+    t_pencil = -usecond();
     result = source;
-    int pc = processor_coor[dim];
+    int pc = grid->_processor_coor[dim];
+
+    const Coordinate ldims = grid->_ldimensions;
+    const Coordinate rdims = grid->_rdimensions;
+    const Coordinate sdims = grid->_simd_layout;
+
+    Coordinate processors = grid->_processors;
+    Coordinate pgdims(Ndim);
+    pgdims[0] = G;
+    for(int d=0, dd=1;d<Ndim;d++){
+      if ( d!=dim ) pgdims[dd++] = ldims[d];
+    }
+    int64_t pgvol=1;
+    for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+    
+    const int Nsimd = vobj::Nsimd();
     for(int p=0;p<processors[dim];p++) {
+      t_copy-=usecond();
+      autoView(r_v,result,AcceleratorRead);
+      accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
+#ifdef GRID_SIMT
       {
-	autoView(r_v,result,CpuRead);
-	autoView(p_v,pgbuf,CpuWrite);
-	thread_for(idx, sgrid->lSites(),{
-          Coordinate cbuf(Nd);
-          sobj s;
-	  sgrid->LocalIndexToLocalCoor(idx,cbuf);
-	  peekLocalSite(s,r_v,cbuf);
-	  cbuf[dim]+=((pc+p) % processors[dim])*L;
-	  pokeLocalSite(s,p_v,cbuf);
-        });
+	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	Coordinate icoor;
+	Coordinate ocoor;
+	Coordinate pgcoor;
+
+	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+	for(int d=0,dd=1;d<Ndim;d++){
+	  if ( d!=dim ) {
+	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+	    dd++;
+	  }
+	}
+
+	// Map coordinates in lattice layout to FFTW index
+	int64_t pgidx;
+	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+	vector_type *from = (vector_type *)&r_v[idx];
+	scalar_type stmp;
+	for(int w=0;w<Ncomp;w++){
+	  int64_t pg_idx = pgidx + w*pgvol;
+	  stmp = getlane(from[w], lane);
+	  pgbuf_v[pg_idx] = stmp;
+	}
+#ifdef GRID_SIMT
       }
+#else
+      }
+#endif
+      });
+
+      t_copy+=usecond();
       if (p != processors[dim] - 1) {
-	result = Cshift(result,dim,L);
+	Lattice<vobj> temp(grid);
+	t_shift-=usecond();
+	temp = Cshift(result,dim,L); result = temp;
+	t_shift+=usecond();
       }
     }
+    t_pencil += usecond();
       
-    //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
-    // Loop over orthog coords
-    int NN=pencil_g.lSites();
-    GridStopWatch timer;
-    timer.Start();
-    thread_for( idx,NN,{
-        Coordinate cbuf(Nd);
-	pencil_g.LocalIndexToLocalCoor(idx, cbuf);
-	if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-	  FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
-	  FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
-	  FFTW<scalar>::fftw_execute_dft(p,in,out);
-	}
-    });
-    timer.Stop();
-      
+    FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+    t_fft = -usecond();
+    FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+    t_fft += usecond();
+    
     // performance counting
-    double add,mul,fma;
-    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-    flops_call = add+mul+2.0*fma;
-    usec += timer.useconds();
-    flops+= flops_call*NN;
-      
-    //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
-    // writing out result
+    flops_call = 5.0*howmany*G*log2(G);
+    usec = t_fft;
+    flops= flops_call;
+
+    result = Zero();
+    
+    double t_insert = -usecond();
     {
-      autoView(pgbuf_v,pgbuf,CpuRead);
-      autoView(result_v,result,CpuWrite);
-      thread_for(idx,sgrid->lSites(),{
-	Coordinate clbuf(Nd), cgbuf(Nd);
-	sobj s;
-	sgrid->LocalIndexToLocalCoor(idx,clbuf);
-	cgbuf = clbuf;
-	cgbuf[dim] = clbuf[dim]+L*pc;
-	peekLocalSite(s,pgbuf_v,cgbuf);
-	pokeLocalSite(s,result_v,clbuf);
+      autoView(r_v,result,AcceleratorWrite);
+      accelerator_for(idx,grid->oSites(),Nsimd,{
+#ifdef GRID_SIMT
+      {
+	int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+      for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	Coordinate icoor(Ndim);
+	Coordinate ocoor(Ndim);
+	Coordinate pgcoor(Ndim);
+
+	Lexicographic::CoorFromIndex(icoor,lane,sdims);
+	Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+
+	pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+	for(int d=0,dd=1;d<Ndim;d++){
+	  if ( d!=dim ) {
+	    pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+	    dd++;
+	  }
+	}
+	// Map coordinates in lattice layout to FFTW index
+	int64_t pgidx;
+	Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+
+	vector_type *to = (vector_type *)&r_v[idx];
+	scalar_type stmp;
+	for(int w=0;w<Ncomp;w++){
+	  int64_t pg_idx = pgidx + w*pgvol;
+	  stmp = pgbuf_v[pg_idx];
+	  putlane(to[w], stmp, lane);
+	}
+	
+#ifdef GRID_SIMT
+      }
+#else
+      }
+#endif
       });
     }
+
     result = result*div;
-      
-    //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+
+    t_insert +=usecond();
+    
     // destroying plan
     FFTW<scalar>::fftw_destroy_plan(p);
-#endif
+
+    t_total +=usecond();
+
+    std::cout <<GridLogPerformance<< " FFT took   "<<t_total/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT insert  "<<t_insert/1.0e6 <<" s" << std::endl;
+    
   }
 };
 
diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index 5c171ff2..d99c6dd5 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -64,7 +64,7 @@ public:
 //
 // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
 // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
-// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
+// with a GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to
 // do it, but I fear it required multiple inheritance and mixed in abstract base classes
 /////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -148,22 +148,22 @@ public:
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
     _Mat.Mdiag(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
     _Mat.Mdir(in,out,dir,disp);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     _Mat.M(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void AdjOp     (const Field &in, Field &out){
     _Mat.Mdag(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     HermOp(in,out);
@@ -188,13 +188,13 @@ public:
   ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     HermOp(in,out);
@@ -271,10 +271,10 @@ public:
     _Mat.Mdag(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 template<class Matrix,class Field>
@@ -303,10 +303,10 @@ public:
     out = out + shift * in;
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 
@@ -345,13 +345,13 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
   }
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
-    assert(0); // must coarsen the unpreconditioned system
+    GRID_ASSERT(0); // must coarsen the unpreconditioned system
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
 };
 template<class Matrix,class Field>
@@ -447,10 +447,10 @@ class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field>
     MpcDag(tmp,out);
   }
   virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   virtual void HermOp(const Field& in, Field& out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void Op(const Field& in, Field& out) {
     Mpc(in, out);
@@ -460,13 +460,13 @@ class NonHermitianSchurOperatorBase :  public LinearOperatorBase<Field>
   }
   // Support for coarsening to a multigrid
   void OpDiag(const Field& in, Field& out) {
-    assert(0); // must coarsen the unpreconditioned system
+    GRID_ASSERT(0); // must coarsen the unpreconditioned system
   }
   void OpDir(const Field& in, Field& out, int dir, int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll(const Field& in, std::vector<Field>& out){
-    assert(0);
+    GRID_ASSERT(0);
   };
 };
 
@@ -580,7 +580,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
  public:
   SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
   { 
-    assert( _Mat.isTrivialEE() );
+    GRID_ASSERT( _Mat.isTrivialEE() );
     mass = _Mat.Mass();
   }
   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -611,7 +611,7 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
     Mpc(in,out);
   }
   virtual void MpcDagMpc(const Field &in, Field &out) {
-    assert(0);// Never need with staggered
+    GRID_ASSERT(0);// Never need with staggered
   }
 };
 template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -623,7 +623,7 @@ template<class Field> class OperatorFunction {
 public:
   virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
   virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
-    assert(in.size()==out.size());
+    GRID_ASSERT(in.size()==out.size());
     for(int k=0;k<in.size();k++){
       (*this)(Linop,in[k],out[k]);
     }
@@ -637,7 +637,7 @@ public:
 
   virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
   {
-    assert(in.size() == out.size());
+    GRID_ASSERT(in.size() == out.size());
 
     for (unsigned int i = 0; i < in.size(); ++i)
     {
diff --git a/Grid/algorithms/approx/Remez.cc b/Grid/algorithms/approx/Remez.cc
index 6fbaadc9..19f7742e 100644
--- a/Grid/algorithms/approx/Remez.cc
+++ b/Grid/algorithms/approx/Remez.cc
@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
   // Reallocate arrays, since degree has changed
   if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
 
-  assert(a_len<=SUM_MAX);
+  GRID_ASSERT(a_len<=SUM_MAX);
 
   step = new bigfloat[num_degree+den_degree+2];
 
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
     equations();
     if (delta < tolerance) {
       std::cout<<"Delta too small, try increasing precision\n";
-      assert(0);
+      GRID_ASSERT(0);
     };    
-    assert( delta>= tolerance);
+    GRID_ASSERT( delta>= tolerance);
 
     search(step);
   }
diff --git a/Grid/algorithms/approx/Remez.h b/Grid/algorithms/approx/Remez.h
index 71b1093b..85720b5a 100644
--- a/Grid/algorithms/approx/Remez.h
+++ b/Grid/algorithms/approx/Remez.h
@@ -134,7 +134,7 @@ class AlgRemez
   virtual ~AlgRemez();
 
   int getDegree(void){ 
-    assert(n==d);
+    GRID_ASSERT(n==d);
     return n;
   }
   // Reset the bounds of the approximation
diff --git a/Grid/algorithms/approx/RemezGeneral.cc b/Grid/algorithms/approx/RemezGeneral.cc
index e41b4ed2..c534aba7 100644
--- a/Grid/algorithms/approx/RemezGeneral.cc
+++ b/Grid/algorithms/approx/RemezGeneral.cc
@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
   pow_n = num_degree;
   pow_d = den_degree;
 
-  if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
-  if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
+  if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0);
+  if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0);
 
-  if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
-  if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
+  if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0);
+  if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0);
 
   num_type = num_type_in;
   den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
     equations();
     if (delta < tolerance) {
       std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
-      assert(0);
+      GRID_ASSERT(0);
     };    
-    assert( delta>= tolerance );
+    GRID_ASSERT( delta>= tolerance );
 
     search();
   }
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
       if(num_pows[j] != -1){ *aa++ = z; t++; }
       z *= x;
     }
-    assert(t == n+1);
+    GRID_ASSERT(t == n+1);
 
     z = (bigfloat)1l;
     t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
       if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
       z *= x;
     }
-    assert(t == d);
+    GRID_ASSERT(t == d);
 
     B[i] = y * z;		// Right hand side vector
   }
diff --git a/Grid/algorithms/approx/RemezGeneral.h b/Grid/algorithms/approx/RemezGeneral.h
index 92553ca5..f83d3c8f 100644
--- a/Grid/algorithms/approx/RemezGeneral.h
+++ b/Grid/algorithms/approx/RemezGeneral.h
@@ -106,7 +106,7 @@ class AlgRemezGeneral{
 		  bigfloat (*f)(bigfloat x, void *data), void *data);
 
   inline int getDegree(void) const{ 
-    assert(n==d);
+    GRID_ASSERT(n==d);
     return n;
   }
   // Reset the bounds of the approximation
diff --git a/Grid/algorithms/approx/ZMobius.cc b/Grid/algorithms/approx/ZMobius.cc
index 65af901f..a7d7d282 100644
--- a/Grid/algorithms/approx/ZMobius.cc
+++ b/Grid/algorithms/approx/ZMobius.cc
@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
 void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
 			 const std::vector<RealD> &omega_in, const int Ls_in,
 			 const RealD lambda_bound){
-  assert(omega_in.size() == Ls_in);
+  GRID_ASSERT(omega_in.size() == Ls_in);
   omega_out.resize(Ls_out);
 
   //Use the Remez algorithm to generate the appropriate rational polynomial
diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index c1025b59..580e8166 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -28,6 +28,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #pragma once
 
 #ifdef GRID_HIP
+#include <hip/hip_version.h>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
@@ -109,8 +110,9 @@ public:
     case GridBLAS_PRECISION_TF32:
       return CUBLAS_COMPUTE_32F_FAST_TF32;
     default:
-      assert(0);
+      GRID_ASSERT(0);
     }
+    return CUBLAS_COMPUTE_32F_FAST_16F;
   }
 #endif
   // Force construct once
@@ -134,11 +136,11 @@ public:
   {
 #ifdef GRID_HIP
     auto err = hipDeviceSynchronize();
-    assert(err==hipSuccess);
+    GRID_ASSERT(err==hipSuccess);
 #endif
 #ifdef GRID_CUDA
     auto err = cudaDeviceSynchronize();
-    assert(err==cudaSuccess);
+    GRID_ASSERT(err==cudaSuccess);
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
@@ -156,7 +158,7 @@ public:
 		   deviceVector<ComplexD*> &Cmn,
 		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
-    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
@@ -221,11 +223,11 @@ public:
 		   deviceVector<ComplexD*> &Cmn,
 		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
-    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 
     //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
     //assert(OpB!=GridBLAS_OP_T);
@@ -254,18 +256,31 @@ public:
     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasZgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
-				   (hipblasDoubleComplex *) &alpha_p[0],
-				   (hipblasDoubleComplex **)&Amk[0], lda,
-				   (hipblasDoubleComplex **)&Bkn[0], ldb,
-				   (hipblasDoubleComplex *) &beta_p[0],
-				   (hipblasDoubleComplex **)&Cmn[0], ldc,
+				   (hipDoubleComplex *) &alpha_p[0],
+				   (hipDoubleComplex **)&Amk[0], lda,
+				   (hipDoubleComplex **)&Bkn[0], ldb,
+				   (hipDoubleComplex *) &beta_p[0],
+				   (hipDoubleComplex **)&Cmn[0], ldc,
 				   batchCount);
+#else
+    auto err = hipblasZgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasDoubleComplex *) &alpha_p[0],
+                                   (hipblasDoubleComplex **)&Amk[0], lda,
+                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
+                                   (hipblasDoubleComplex *) &beta_p[0],
+                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
+                                   batchCount);
+#endif
     //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -286,7 +301,7 @@ public:
 				  (cuDoubleComplex *) &beta_p[0],
 				  (cuDoubleComplex **)&Cmn[0], ldc,
 				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -490,10 +505,10 @@ public:
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
     RealD t0=usecond();
 
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 #ifdef GRID_HIP
-    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -502,18 +517,31 @@ public:
     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasCgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
-				   (hipblasComplex *) &alpha_p[0],
-				   (hipblasComplex **)&Amk[0], lda,
-				   (hipblasComplex **)&Bkn[0], ldb,
-				   (hipblasComplex *) &beta_p[0],
-				   (hipblasComplex **)&Cmn[0], ldc,
+				   (hipComplex *) &alpha_p[0],
+				   (hipComplex **)&Amk[0], lda,
+				   (hipComplex **)&Bkn[0], ldb,
+				   (hipComplex *) &beta_p[0],
+				   (hipComplex **)&Cmn[0], ldc,
 				   batchCount);
+#else
+    auto err = hipblasCgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (hipblasComplex *) &alpha_p[0],
+                                   (hipblasComplex **)&Amk[0], lda,
+                                   (hipblasComplex **)&Bkn[0], ldb,
+                                   (hipblasComplex *) &beta_p[0],
+                                   (hipblasComplex **)&Cmn[0], ldc,
+                                   batchCount);
 
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -549,10 +577,10 @@ public:
 				(void **)&Cmn[0], CUDA_C_32F, ldc,
 				batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
     }
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
     int64_t m64=m;
     int64_t n64=n;
     int64_t k64=k;
@@ -584,7 +612,7 @@ public:
     synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
     // Need a default/reference implementation; use Eigen
       if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
 	thread_for (p, batchCount, {
@@ -681,8 +709,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
-    assert(OpB!=GridBLAS_OP_C);
+    GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    GRID_ASSERT(OpB!=GridBLAS_OP_C);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -698,8 +726,8 @@ public:
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
     RealD t0=usecond();
 
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 #ifdef GRID_HIP
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
@@ -719,7 +747,7 @@ public:
 				   (float *) &beta_p[0],
 				   (float **)&Cmn[0], ldc,
 				   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -740,7 +768,7 @@ public:
 				  (float *) &beta_p[0],
 				  (float **)&Cmn[0], ldc,
 				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -840,8 +868,8 @@ public:
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
 
-    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
-    assert(OpB!=GridBLAS_OP_C);
+    GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    GRID_ASSERT(OpB!=GridBLAS_OP_C);
 
     int lda = m; // m x k column major
     int ldb = k; // k x n column major
@@ -858,8 +886,8 @@ public:
     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
     RealD t0=usecond();
 
-    assert(Bkn.size()==batchCount);
-    assert(Cmn.size()==batchCount);
+    GRID_ASSERT(Bkn.size()==batchCount);
+    GRID_ASSERT(Cmn.size()==batchCount);
 #ifdef GRID_HIP
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
@@ -879,7 +907,7 @@ public:
 				   (double *) &beta_p[0],
 				   (double **)&Cmn[0], ldc,
 				   batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     cublasOperation_t hOpA;
@@ -900,7 +928,7 @@ public:
 				  (double *) &beta_p[0],
 				  (double **)&Cmn[0], ldc,
 				  batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
       int64_t m64=m;
@@ -1002,7 +1030,7 @@ public:
 		      deviceVector<ComplexD*> &Cnn) {
 
     int64_t batchCount = Ann.size();
-    assert(batchCount == Cnn.size());
+    GRID_ASSERT(batchCount == Cnn.size());
     thread_for(p,batchCount, {
 	Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
 	Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
@@ -1015,7 +1043,7 @@ public:
 		      deviceVector<ComplexF*> &Cnn) {
 
     int64_t batchCount = Ann.size();
-    assert(batchCount == Cnn.size());
+    GRID_ASSERT(batchCount == Cnn.size());
     thread_for(p,batchCount, {
 	Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
 	Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
@@ -1028,7 +1056,7 @@ public:
 			  deviceVector<ComplexD*> &C) {
 
     int64_t batchCount = Ann.size();
-    assert(batchCount == C.size());
+    GRID_ASSERT(batchCount == C.size());
     thread_for(p,batchCount, {
 	Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
 	*C[p] = eAnn.determinant();
@@ -1040,7 +1068,7 @@ public:
 			  deviceVector<ComplexF*> &C) {
 
     int64_t batchCount = Ann.size();
-    assert(batchCount == C.size());
+    GRID_ASSERT(batchCount == C.size());
     thread_for(p,batchCount, {
 	Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
 	*C[p] = eAnn.determinant();
@@ -1089,16 +1117,24 @@ public:
 		    deviceVector<int64_t> &info)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount*n);
-    assert(info.size()==batchCount);
+    GRID_ASSERT(ipiv.size()==batchCount*n);
+    GRID_ASSERT(info.size()==batchCount);
 
 #ifdef GRID_HIP
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
-				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (hipDoubleComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
 				    (int*) &info[0],
 				    (int)batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+#else
+    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+                                    (hipblasDoubleComplex **)&Ann[0], (int)n,
+                                    (int*) &ipiv[0],
+                                    (int*) &info[0],
+                                    (int)batchCount);
+#endif
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
@@ -1106,7 +1142,7 @@ public:
 				   (int*) &ipiv[0],
 				   (int*) &info[0],
 				   (int)batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
     getrfBatchedSYCL(n, Ann, ipiv, info);
@@ -1119,16 +1155,25 @@ public:
 		    deviceVector<int64_t> &info)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount*n);
-    assert(info.size()==batchCount);
+    GRID_ASSERT(ipiv.size()==batchCount*n);
+    GRID_ASSERT(info.size()==batchCount);
 
 #ifdef GRID_HIP
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
-				    (hipblasComplex **)&Ann[0], (int)n,
+				    (hipComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
 				    (int*) &info[0],
 				    (int)batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+#else
+    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
+                                    (hipblasComplex **)&Ann[0], (int)n,
+                                    (int*) &ipiv[0],
+                                    (int*) &info[0],
+                                    (int)batchCount);
+#endif
+
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
@@ -1136,7 +1181,7 @@ public:
 				   (int*) &ipiv[0],
 				   (int*) &info[0],
 				   (int)batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
     getrfBatchedSYCL(n, Ann, ipiv, info);
@@ -1195,18 +1240,28 @@ public:
 		    deviceVector<ComplexD*> &Cnn)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount*n);
-    assert(info.size()==batchCount);
-    assert(Cnn.size()==batchCount);
+    GRID_ASSERT(ipiv.size()==batchCount*n);
+    GRID_ASSERT(info.size()==batchCount);
+    GRID_ASSERT(Cnn.size()==batchCount);
 
 #ifdef GRID_HIP
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
-				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (hipDoubleComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
-				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+				    (hipDoubleComplex **)&Cnn[0], (int)n,
 				    (int*) &info[0],
 				    (int)batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+#else
+    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
+                                    (hipblasDoubleComplex **)&Ann[0], (int)n,
+                                    (int*) &ipiv[0],
+                                    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+                                    (int*) &info[0],
+                                    (int)batchCount);
+
+#endif
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     auto err = cublasZgetriBatched(gridblasHandle, (int)n,
@@ -1215,7 +1270,7 @@ public:
 				   (cuDoubleComplex **)&Cnn[0], (int)n,
 				   (int*) &info[0],
 				   (int)batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
     getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
@@ -1229,18 +1284,27 @@ public:
 		    deviceVector<ComplexF*> &Cnn)
   {
     int64_t batchCount = Ann.size();
-    assert(ipiv.size()==batchCount*n);
-    assert(info.size()==batchCount);
-    assert(Cnn.size()==batchCount);
+    GRID_ASSERT(ipiv.size()==batchCount*n);
+    GRID_ASSERT(info.size()==batchCount);
+    GRID_ASSERT(Cnn.size()==batchCount);
 
 #ifdef GRID_HIP
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
     auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
-				    (hipblasComplex **)&Ann[0], (int)n,
+				    (hipComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
-				    (hipblasComplex **)&Cnn[0], (int)n,
+				    (hipComplex **)&Cnn[0], (int)n,
 				    (int*) &info[0],
 				    (int)batchCount);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
+#else
+    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
+                                    (hipblasComplex **)&Ann[0], (int)n,
+                                    (int*) &ipiv[0],
+                                    (hipblasComplex **)&Cnn[0], (int)n,
+                                    (int*) &info[0],
+                                    (int)batchCount);
+#endif
+    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
     auto err = cublasCgetriBatched(gridblasHandle, (int)n,
@@ -1249,7 +1313,7 @@ public:
 				   (cuComplex **)&Cnn[0], (int)n,
 				   (int*) &info[0],
 				   (int)batchCount);
-    assert(err==CUBLAS_STATUS_SUCCESS);
+    GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
     getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
diff --git a/Grid/algorithms/blas/MomentumProject.h b/Grid/algorithms/blas/MomentumProject.h
new file mode 100644
index 00000000..7af7f022
--- /dev/null
+++ b/Grid/algorithms/blas/MomentumProject.h
@@ -0,0 +1,300 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: MomentumProject.h
+
+    Copyright (C) 2025
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+/* 
+   MultiMomProject
+
+   Import vectors -> nxyz x (ncomponent x nt)
+   Import complex phases -> nmom x nxyz
+
+   apply = via (possibly batched) GEMM
+*/
+template<class Field, class ComplexField>
+class MomentumProject
+{
+public:
+
+  typedef typename Field::scalar_type   scalar;
+  typedef typename Field::scalar_object scalar_object;
+
+  GridBase *grid;
+  uint64_t nmom;
+  uint64_t nxyz;
+  uint64_t nt;
+  uint64_t nbtw;
+  uint64_t words;
+
+  deviceVector<scalar> BLAS_V;      // imported field: nxyz * nt * words scalars (sized in Allocate)
+  deviceVector<scalar> BLAS_M;      // momentum phases: nmom * nxyz scalars (sized in Allocate)
+  deviceVector<scalar> BLAS_P;      // projection result: nmom * nt * words scalars (sized in Allocate)
+  
+  MomentumProject(){};
+ ~MomentumProject(){ Deallocate(); };
+  
+  void Deallocate(void) // Reset to the empty state: no grid, zero extents, zero-length BLAS buffers
+  {
+    grid=nullptr;
+    nmom=0;
+    nxyz=0;
+    nt=0;
+    nbtw=0;
+    words=0;
+    BLAS_V.resize(0); // all three buffers are resized to 0 elements
+    BLAS_M.resize(0);
+    BLAS_P.resize(0);
+  }
+  void Allocate(int _nmom,GridBase *_grid) // Record the grid and extents, then size the BLAS buffers for _nmom momenta
+  {
+    grid=_grid;
+    Coordinate ldims = grid->LocalDimensions();
+    // nt = local extent of the last dimension; nxyz = local sites divided by nt
+    nmom=_nmom;
+    nt   = ldims[grid->Nd()-1];
+    nxyz = grid->lSites()/nt;
+    words = sizeof(scalar_object)/sizeof(scalar); // number of scalars in one scalar_object
+    nbtw = nt * words;
+    // Buffer sizes are in units of scalar
+    BLAS_V.resize (nxyz * nt * words );
+    BLAS_M.resize (nmom * nxyz       );
+    BLAS_P.resize (nmom * nt * words );
+  }
+  // Copy the phase fields in `momenta` into the device buffer BLAS_M.
+  // Field m is read at the local sites with Lexicographic index xyz in
+  // [0,nxyz) and stored at BLAS_M[xyz + m*nxyz].
+  // Asserts nmom>0, momenta.size()==nmom, that momenta[0] lives on `grid`,
+  // and that BLAS_M holds nxyz*nmom scalars (the size set by Allocate()).
+  void ImportMomenta(const std::vector <ComplexField> &momenta)
+  {
+    //    might as well just make the momenta here
+    int nd = grid->_ndimension;
+
+    uint64_t sz = BLAS_M.size();
+
+    GRID_ASSERT(nmom > 0); // momenta[0] is dereferenced two lines below
+    GRID_ASSERT(momenta.size()==nmom);
+    GRID_ASSERT(momenta[0].Grid()==grid);
+    GRID_ASSERT(sz == nxyz * nmom); // buffer must match the Allocate() sizing
+    
+    Coordinate rdimensions = grid->_rdimensions;
+    Coordinate ldims       = grid->LocalDimensions();
+    Coordinate simd        = grid->_simd_layout;
+    // Local copies for use inside the accelerator_for body
+    int64_t Nxyz = nxyz;
+    auto blasData_p  = &BLAS_M[0];
+    for(int m=0;m<momenta.size();m++){
+
+      autoView( Data   , momenta[m], AcceleratorRead);
+
+      accelerator_for(xyz,nxyz,1,{
+	  //////////////////////////////////////////
+	  // isite -- map lane within buffer to lane within lattice
+	  ////////////////////////////////////////////
+	    Coordinate lcoor(nd,0);
+	    Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
+	    
+	    Coordinate icoor(nd);
+	    Coordinate ocoor(nd);
+	    for (int d = 0; d < nd; d++) {
+	      icoor[d] = lcoor[d]/rdimensions[d];
+	      ocoor[d] = lcoor[d]%rdimensions[d];
+	    }
+	    int64_t osite;
+	    int64_t isite;
+	    Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
+	    Lexicographic::IndexFromCoor(icoor,isite,simd);
+	    
+	    // BLAS_M[nmom][slice_vol]
+	    // Fortran Column major BLAS layout is M_xyz,mom
+	    scalar data = extractLane(isite,Data[osite]);
+	    uint64_t idx = xyz+m*Nxyz;
+	    blasData_p[idx] = data;
+	});
+    }
+  }
+  // Copy every scalar word of `vec` into the device buffer BLAS_V.
+  // Word w of the site with last-dimension coordinate t and spatial index xyz
+  // lands at BLAS_V[w + t*words + xyz*words*nt].
+  void ImportVector(Field &vec)
+  {
+    typedef typename Field::vector_object vobj;
+
+    int nd = grid->_ndimension;
+    uint64_t sz = BLAS_V.size();
+
+    GRID_ASSERT(sz == nxyz * words * nt); // buffer must match the Allocate() sizing
+    
+    Coordinate rdimensions = grid->_rdimensions;
+    Coordinate ldims= grid->LocalDimensions();
+    int64_t osites = grid->oSites();
+    Coordinate simd = grid->_simd_layout;
+    const int Nsimd = vobj::Nsimd();
+
+    auto blasData_p  = &BLAS_V[0];
+    autoView( Data   , vec, AcceleratorRead);
+
+    int64_t nwords = words;// for capture
+    int64_t Nt     = nt;// for capture
+    
+    accelerator_for(sf,osites,Nsimd,{
+#ifdef GRID_SIMT
+        {
+	  int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+	  for(int lane=0;lane<Nsimd;lane++) {
+#endif
+	  //////////////////////////////////////////
+	  // isite -- map lane within buffer to lane within lattice
+	  ////////////////////////////////////////////
+	    Coordinate lcoor(nd,0);
+	    Coordinate icoor(nd);
+	    Coordinate ocoor(nd);
+	    
+	    Lexicographic::CoorFromIndex(icoor,lane,simd);
+	    Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
+
+	  
+	    int64_t l_xyz = 0;
+	    for (int d = 0; d < nd; d++) {
+	      lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
+	    }
+	    uint64_t l_t   = lcoor[nd-1];
+
+	    Coordinate xyz_coor = lcoor;
+	    xyz_coor[nd-1] =0;
+	    Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
+
+	    
+	    scalar_object data = extractLane(lane,Data[sf]);
+	    scalar *data_words = (scalar *) &data;
+	    for(int w = 0 ; w < nwords; w++) {
+	      // BLAS_V[slice_vol][nt][words]
+	      // Fortran Column major BLAS layout is V_(w,t)_xyz
+	      uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
+	      blasData_p[idx] = data_words[w];
+	    }
+#ifdef GRID_SIMT
+	}
+#else
+	}
+#endif
+	});
+  }
+  void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection) // copy BLAS_P into `projection`
+  {
+    projection.resize(nmom*nt); // nmom*nt scalar_objects; Allocate() sizes BLAS_P to nmom*nt*words scalars
+    acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
+    // Could decide on a layout late?
+  }
+
+  // Project(data, projected_gdata): import `data`, multiply by BLAS_M in one GEMM.
+  //
+  // Row major layout "C" order:
+  // BLAS_V[slice_vol][nt][words]
+  // BLAS_M[nmom][slice_vol]
+  // BLAS_P[nmom][nt][words]
+  //
+  // Fortran Column major BLAS layout is V_(w,t)_xyz
+  // Fortran Column major BLAS layout is M_xyz,mom
+  // Fortran Column major BLAS layout is P_(w,t),mom
+  //
+  // Projected: P = (V * M)_(w,t),mom ; on return projected_gdata[t + gt*m]
+  // holds the entry for global time index t and momentum m after GlobalSumVector.
+  void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
+  {
+    double t_import=0;
+    double t_export=0;
+    double t_gemm  =0;
+    double t_allreduce=0;
+    t_import-=usecond();
+    this->ImportVector(data);
+    // BLAS_M is used as it stands: Project does not call ImportMomenta itself
+    std::vector< typename Field::scalar_object > projected_planes;
+    // Single-entry pointer arrays: gemmBatched is called with a batch of one
+    deviceVector<scalar *> Vd(1);
+    deviceVector<scalar *> Md(1);
+    deviceVector<scalar *> Pd(1);
+
+    scalar * Vh = & BLAS_V[0];
+    scalar * Mh = & BLAS_M[0];
+    scalar * Ph = & BLAS_P[0];
+
+    acceleratorPut(Vd[0],Vh);
+    acceleratorPut(Md[0],Mh);
+    acceleratorPut(Pd[0],Ph);
+    t_import+=usecond();
+
+    GridBLAS BLAS;
+
+    /////////////////////////////////////////
+    // P_(w,t),m = sum_xyz V_(w,t),xyz * M_xyz,m   i.e. (words*nt x nxyz) . (nxyz x nmom)
+    /////////////////////////////////////////
+    t_gemm-=usecond();
+    BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, 
+    		     words*nt,nmom,nxyz,
+		     scalar(1.0),
+		     Vd,
+		     Md,
+		     scalar(0.0),  // wipe out result
+		     Pd);
+    BLAS.synchronise();
+    t_gemm+=usecond();
+
+    t_export-=usecond();
+    ExportMomentumProjection(projected_planes); // resizes
+    t_export+=usecond();
+
+    /////////////////////////////////
+    // Reduce across MPI ranks
+    /////////////////////////////////
+    int nd = grid->Nd();
+    int gt = grid->GlobalDimensions()[nd-1];
+    int lt = grid->LocalDimensions()[nd-1];
+    projected_gdata.resize(gt*nmom);
+    for(int t=0;t<gt*nmom;t++){ // global Nt array with zeroes for stuff not on this node
+      projected_gdata[t]=Zero();
+    }
+    for(int t=0;t<lt;t++){
+    for(int m=0;m<nmom;m++){
+      int st = grid->LocalStarts()[nd-1]; // offset of this rank's local t range in the global t range
+      projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
+    }}
+    t_allreduce-=usecond();
+    grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
+    t_allreduce+=usecond();
+    // Report the four accumulated timers
+    std::cout << GridLogPerformance<<" MomentumProject t_import  "<<t_import<<"us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_export  "<<t_export<<"us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_gemm    "<<t_gemm<<"us"<<std::endl;
+    std::cout << GridLogPerformance<<" MomentumProject t_reduce  "<<t_allreduce<<"us"<<std::endl;
+
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/algorithms/deflation/Deflation.h b/Grid/algorithms/deflation/Deflation.h
index 1a8f97c9..8dad88ae 100644
--- a/Grid/algorithms/deflation/Deflation.h
+++ b/Grid/algorithms/deflation/Deflation.h
@@ -69,8 +69,8 @@ public:
   DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
   : evec(_evec), eval(_eval), N(_N)
   {
-    assert(evec.size()==eval.size());
-    assert(N <= evec.size());
+    GRID_ASSERT(evec.size()==eval.size());
+    GRID_ASSERT(N <= evec.size());
   } 
 
   virtual void operator()(const Field &src,Field &guess) {
@@ -141,11 +141,10 @@ public:
     }
     //postprocessing
     std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockPromote(guess_coarse[j],guess[j],subspace);
-    guess[j].Checkerboard() = src[j].Checkerboard();
+    for (int j=0;j<Nsrc;j++) {
+      std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
+      blockPromote(guess_coarse[j],guess[j],subspace);
+      guess[j].Checkerboard() = src[j].Checkerboard();
     }
   };
 
diff --git a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
index eeeb0424..e16d95bf 100644
--- a/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
@@ -160,7 +160,7 @@ public:
     uint64_t words;
 
     nrhs = X.size();
-    assert(X.size()==Y.size());
+    GRID_ASSERT(X.size()==Y.size());
     conformable(X[0],Y[0]);
 
     grid  = X[0].Grid();
@@ -259,7 +259,7 @@ public:
     uint64_t words;
 
     nrhs = X.size();
-    assert(X.size()==Y.size());
+    GRID_ASSERT(X.size()==Y.size());
     conformable(X[0],Y[0]);
 
     grid  = X[0].Grid();
@@ -267,7 +267,7 @@ public:
     vol   = grid->oSites()/rd0;
     words = rd0*sizeof(vector_object)/sizeof(scalar);
     int64_t vw = vol * words;
-    assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
+    GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
 
     RealD t0 = usecond();
     BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
diff --git a/Grid/algorithms/deflation/MultiRHSBlockProject.h b/Grid/algorithms/deflation/MultiRHSBlockProject.h
index d9b4acb5..8212189c 100644
--- a/Grid/algorithms/deflation/MultiRHSBlockProject.h
+++ b/Grid/algorithms/deflation/MultiRHSBlockProject.h
@@ -131,12 +131,12 @@ public:
     typedef typename Field::vector_object vobj;
     //    std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
 
-    assert(vecs[0].Grid()==fine_grid);
+    GRID_ASSERT(vecs[0].Grid()==fine_grid);
 
     subdivides(coarse_grid,fine_grid); // require they map
 
     int _ndimension = coarse_grid->_ndimension;
-    assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
+    GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
     
     Coordinate  block_r      (_ndimension);
     for(int d=0 ; d<_ndimension;d++){
@@ -164,7 +164,7 @@ public:
       const int Nsimd = vobj::Nsimd();
       //      std::cout << "sz "<<sz<<std::endl;
       //      std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
-      assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
+      GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
       uint64_t lwords= words; // local variable for copy in to GPU
       accelerator_for(sf,osites,Nsimd,{
 #ifdef GRID_SIMT
@@ -198,7 +198,7 @@ public:
    	               + v*bv
 	               + sb;
 
-	  //	  assert(site*lwords<sz);
+	  //	  GRID_ASSERT(site*lwords<sz);
 
 	  scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
 
@@ -219,12 +219,12 @@ public:
 
     int nvec = vecs.size();
 
-    assert(vecs[0].Grid()==fine_grid);
+    GRID_ASSERT(vecs[0].Grid()==fine_grid);
 
     subdivides(coarse_grid,fine_grid); // require they map
 
     int _ndimension = coarse_grid->_ndimension;
-    assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
+    GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
     
     Coordinate  block_r      (_ndimension);
     for(int d=0 ; d<_ndimension;d++){
@@ -299,7 +299,7 @@ public:
 
     //    std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
 
-    assert(vecs[0].Grid()==coarse_grid);
+    GRID_ASSERT(vecs[0].Grid()==coarse_grid);
 
     int _ndimension = coarse_grid->_ndimension;
 
@@ -320,7 +320,7 @@ public:
       // loop over fine sites
       const int Nsimd = vobj::Nsimd();
       uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
-      assert(cwords==nbasis);
+      GRID_ASSERT(cwords==nbasis);
       
       accelerator_for(sc,osites,Nsimd,{
 #ifdef GRID_SIMT
@@ -353,7 +353,7 @@ public:
     typedef typename vobj::scalar_object coarse_scalar_object;
     //    std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
 
-    assert(vecs[0].Grid()==coarse_grid);
+    GRID_ASSERT(vecs[0].Grid()==coarse_grid);
 
     int _ndimension = coarse_grid->_ndimension;
     
@@ -375,7 +375,7 @@ public:
       // loop over fine sites
       const int Nsimd = vobj::Nsimd();
       uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
-      assert(cwords==nbasis);
+      GRID_ASSERT(cwords==nbasis);
       
       accelerator_for(sc,osites,Nsimd,{
 	  // Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
@@ -409,7 +409,7 @@ public:
     int nrhs=fine.size();
     int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
     //    std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
-    assert(nbasis==_nbasis);
+    GRID_ASSERT(nbasis==_nbasis);
     
     BLAS_F.resize (fine_vol * words * nrhs );
     BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -464,7 +464,7 @@ public:
   {
     int nrhs=fine.size();
     int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
-    assert(nbasis==_nbasis);
+    GRID_ASSERT(nbasis==_nbasis);
     
     BLAS_F.resize (fine_vol * words * nrhs );
     BLAS_C.resize (coarse_vol * nbasis * nrhs );
diff --git a/Grid/algorithms/deflation/MultiRHSDeflation.h b/Grid/algorithms/deflation/MultiRHSDeflation.h
index 9f20bc50..069390f4 100644
--- a/Grid/algorithms/deflation/MultiRHSDeflation.h
+++ b/Grid/algorithms/deflation/MultiRHSDeflation.h
@@ -98,7 +98,7 @@ public:
   void ImportEigenVector(Field &evec,RealD &_eval, int ev)
   {
     //    std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
-    assert(ev<eval.size());
+    GRID_ASSERT(ev<eval.size());
     eval[ev] = _eval;
 
     int64_t offset = ev*vol*words;
@@ -113,7 +113,7 @@ public:
   // Could use to import a batch of eigenvectors
   void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
   {
-    assert(_ev0+_nev<=evec.size());
+    GRID_ASSERT(_ev0+_nev<=evec.size());
 
     Allocate(_nev,evec[0].Grid());
     
@@ -126,8 +126,8 @@ public:
   void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
   {
     int nrhs = source.size();
-    assert(source.size()==guess.size());
-    assert(grid == guess[0].Grid());
+    GRID_ASSERT(source.size()==guess.size());
+    GRID_ASSERT(grid == guess[0].Grid());
     conformable(guess[0],source[0]);
 
     int64_t vw = vol * words;
@@ -189,7 +189,7 @@ public:
 		     Cd);
     BLAS.synchronise();
 
-    assert(BLAS_C.size()==nev*nrhs);
+    GRID_ASSERT(BLAS_C.size()==nev*nrhs);
 
     std::vector<scalar> HOST_C(BLAS_C.size());      // nrhs . nev -- the coefficients 
     acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
diff --git a/Grid/algorithms/iterative/AdefGeneric.h b/Grid/algorithms/iterative/AdefGeneric.h
index e0b99fcb..9ae1e611 100644
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -270,7 +270,7 @@ class TwoLevelCG : public LinearFunction<Field>
     std::vector<RealD> src_nrm(nrhs);
     for(int rhs=0;rhs<nrhs;rhs++) {
       src_nrm[rhs]=norm2(src[rhs]);
-      assert(src_nrm[rhs]!=0.0);
+      GRID_ASSERT(src_nrm[rhs]!=0.0);
     }
     std::vector<RealD> tn(nrhs);
 
diff --git a/Grid/algorithms/iterative/AdefMrhs.h b/Grid/algorithms/iterative/AdefMrhs.h
index 810d7391..e2090009 100644
--- a/Grid/algorithms/iterative/AdefMrhs.h
+++ b/Grid/algorithms/iterative/AdefMrhs.h
@@ -92,8 +92,8 @@ class TwoLevelCGmrhs
   // Vector case
   virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
   {
-    //    SolveSingleSystem(src,x);
-    SolvePrecBlockCG(src,x);
+    SolveSingleSystem(src,x);
+	// SolvePrecBlockCG(src,x);
   }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -161,7 +161,7 @@ class TwoLevelCGmrhs
     ////////////////////////////////////////////
     std::vector<RealD> ssq(nrhs);
     for(int rhs=0;rhs<nrhs;rhs++){
-      ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
+      ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
     }      
 
     ///////////////////////////
@@ -382,7 +382,7 @@ class TwoLevelCGmrhs
     }
     HDCGTimer.Stop();
     std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
 
   virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
@@ -415,7 +415,7 @@ class TwoLevelCGmrhs
     std::vector<RealD> src_nrm(nrhs);
     for(int rhs=0;rhs<nrhs;rhs++) {
       src_nrm[rhs]=norm2(src[rhs]);
-      assert(src_nrm[rhs]!=0.0);
+      GRID_ASSERT(src_nrm[rhs]!=0.0);
     }
     std::vector<RealD> tn(nrhs);
 
diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h
index f4e5cdda..d5f8c359 100644
--- a/Grid/algorithms/iterative/BiCGSTAB.h
+++ b/Grid/algorithms/iterative/BiCGSTAB.h
@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
   public:
     using OperatorFunction<Field>::operator();
     
-    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+    bool ErrorOnNoConverge;  // throw a GRID_ASSERT when the CG fails to converge.
                              // Defaults true.
     RealD Tolerance;
     Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
 
       // Initial residual computation & set up
       RealD guess = norm2(psi);
-      assert(std::isnan(guess) == 0);
+      GRID_ASSERT(std::isnan(guess) == 0);
     
       Linop.Op(psi, v);
       b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
           std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl;
           std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
 
-          if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
+          if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); }
 
           IterationsToComplete = k;	
 
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
       
       std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
 
-      if(ErrorOnNoConverge){ assert(0); }
+      if(ErrorOnNoConverge){ GRID_ASSERT(0); }
       IterationsToComplete = k;
     }
 };
diff --git a/Grid/algorithms/iterative/BlockConjugateGradient.h b/Grid/algorithms/iterative/BlockConjugateGradient.h
index d194bb06..2497dbf3 100644
--- a/Grid/algorithms/iterative/BlockConjugateGradient.h
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -98,7 +98,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
   int Nblock;
 
   BlockCGtype CGtype;
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw a GRID_ASSERT when the CG fails to converge.
                            // Defaults true.
   RealD Tolerance;
   Integer MaxIterations;
@@ -201,7 +201,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
   } else if (CGtype == CGmultiRHS ) {
     CGmultiRHSsolve(Linop,Src,Psi);
   } else {
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) 
@@ -209,7 +209,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
   if ( CGtype == BlockCGrQVec ) {
     BlockCGrQsolveVec(Linop,Src,Psi);
   } else {
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 
@@ -259,10 +259,10 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
 
   sliceNorm(residuals,B,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   sliceNorm(residuals,X,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   /************************************************************************
    * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -402,7 +402,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
 	    <<" residual "<< std::sqrt(max_resid)<< std::endl;
 
-  if (ErrorOnNoConverge) assert(0);
+  if (ErrorOnNoConverge) GRID_ASSERT(0);
   IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
@@ -438,10 +438,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
   for(int b=0;b<Nblock;b++) sssum+=ssq[b];
 
   sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   // Initial search dir is guess
   Linop.HermOp(Psi, AP);
@@ -540,7 +540,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
   }
   std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
 
-  if (ErrorOnNoConverge) assert(0);
+  if (ErrorOnNoConverge) GRID_ASSERT(0);
   IterationsToComplete = k;
 }
 
@@ -554,7 +554,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
 void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
 {
   Nblock = B.size();
-  assert(Nblock == X.size());
+  GRID_ASSERT(Nblock == X.size());
 
   std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
 
@@ -594,10 +594,10 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
   for(int b=0;b<Nblock;b++) sssum+=ssq[b];
 
   for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
-  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
 
   /************************************************************************
    * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -731,7 +731,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
   }
   std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
 
-  if (ErrorOnNoConverge) assert(0);
+  if (ErrorOnNoConverge) GRID_ASSERT(0);
   IterationsToComplete = k;
 }
 
diff --git a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
index 6e206d59..537e8b1a 100644
--- a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
  public:
   using OperatorFunction<Field>::operator();
 
-  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw a GRID_ASSERT when CAGMRES fails to converge,
                           // defaults to true
 
   RealD   Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
     conformable(psi, src);
 
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD cp;
     RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
     std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
   }
 
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
       }
     }
 
-    assert(0); // Never reached
+    GRID_ASSERT(0); // Never reached
     return cp;
   }
 
diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h
index 65a77d83..75ed11f1 100644
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -45,7 +45,7 @@ public:
 
   using OperatorFunction<Field>::operator();
   
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw a GRID_ASSERT when the CG fails to converge.
                            // Defaults true.
   RealD Tolerance;
   Integer MaxIterations;
@@ -94,7 +94,7 @@ public:
     ssq = norm2(src);
     RealD guess = norm2(psi);
     NormTimer.Stop();
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
     AssignTimer.Start();
     if ( guess == 0.0 ) {
       r = src;
@@ -222,7 +222,7 @@ public:
 
 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
 
-        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+        if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
 
 	IterationsToComplete = k;	
 	TrueResidual = true_residual;
@@ -251,7 +251,7 @@ public:
     std::cout << GridLogPerformance << "\t\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
     std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 
-    if (ErrorOnNoConverge) assert(0);
+    if (ErrorOnNoConverge) GRID_ASSERT(0);
     IterationsToComplete = k;
 
   }
diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
index 93f5c677..32df0302 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
@@ -77,7 +77,7 @@ public:
   }
 
   void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
-    assert(src_d_in.size() == sol_d.size());
+    GRID_ASSERT(src_d_in.size() == sol_d.size());
     int NBatch = src_d_in.size();
 
     std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
index e00e94c9..4e84593b 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -98,9 +98,9 @@ public:
     std::vector<RealD> alpha(nshift,1.0);
     std::vector<Field>   ps(nshift,grid);// Search directions
 
-    assert(psi.size()==nshift);
-    assert(mass.size()==nshift);
-    assert(mresidual.size()==nshift);
+    GRID_ASSERT(psi.size()==nshift);
+    GRID_ASSERT(mass.size()==nshift);
+    GRID_ASSERT(mresidual.size()==nshift);
   
     // remove dynamic sized arrays on stack; 2d is a pain with vector
     std::vector<RealD>  bs(nshift);
@@ -122,7 +122,7 @@ public:
   
     // Check lightest mass
     for(int s=0;s<nshift;s++){
-      assert( mass[s]>= mass[primary] );
+      GRID_ASSERT( mass[s]>= mass[primary] );
       converged[s]=0;
     }
   
@@ -338,7 +338,7 @@ public:
     }
     // ugly hack
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    //  assert(0);
+    //  GRID_ASSERT(0);
   }
 
 };
diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
index c6102eb2..1ea2e964 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
@@ -118,9 +118,9 @@ public:
     FieldF r_f(SinglePrecGrid);
     FieldD mmp_d(DoublePrecGrid);
 
-    assert(psi_d.size()==nshift);
-    assert(mass.size()==nshift);
-    assert(mresidual.size()==nshift);
+    GRID_ASSERT(psi_d.size()==nshift);
+    GRID_ASSERT(mass.size()==nshift);
+    GRID_ASSERT(mresidual.size()==nshift);
   
     // dynamic sized arrays on stack; 2d is a pain with vector
     std::vector<RealD>  bs(nshift);
@@ -141,7 +141,7 @@ public:
 
     // Check lightest mass
     for(int s=0;s<nshift;s++){
-      assert( mass[s]>= mass[primary] );
+      GRID_ASSERT( mass[s]>= mass[primary] );
       converged[s]=0;
     }
   
@@ -179,7 +179,7 @@ public:
     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     tmp_d = tmp_d - mmp_d;
     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    //    assert(norm2(tmp_d)< 1.0e-4);
+    //    GRID_ASSERT(norm2(tmp_d)< 1.0e-4);
 
     axpy(mmp_d,mass[0],p_d,mmp_d);
     RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
    
     }
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
 
 };
diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
index 24a3228a..bbed0650 100644
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -48,12 +48,12 @@ public:
 
   ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
 
-  void OpDiag (const Field &in, Field &out){ assert(0); }
-  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+  void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); }
   
-  void Op     (const Field &in, Field &out){ assert(0); }
-  void AdjOp  (const Field &in, Field &out){ assert(0); }
+  void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
 
   void HermOp(const Field &in, Field &out){
     linop_base.HermOp(in, out);
@@ -151,9 +151,9 @@ public:
     FieldD r_d(DoublePrecGrid);
     FieldD mmp_d(DoublePrecGrid);
 
-    assert(psi_d.size()==nshift);
-    assert(mass.size()==nshift);
-    assert(mresidual.size()==nshift);
+    GRID_ASSERT(psi_d.size()==nshift);
+    GRID_ASSERT(mass.size()==nshift);
+    GRID_ASSERT(mresidual.size()==nshift);
   
     // dynamic sized arrays on stack; 2d is a pain with vector
     std::vector<RealD>  bs(nshift);
@@ -174,7 +174,7 @@ public:
 
     // Check lightest mass
     for(int s=0;s<nshift;s++){
-      assert( mass[s]>= mass[primary] );
+      GRID_ASSERT( mass[s]>= mass[primary] );
       converged[s]=0;
     }
   
@@ -211,7 +211,7 @@ public:
     Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
     tmp_d = tmp_d - mmp_d;
     std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    assert(norm2(tmp_d)< 1.0);
+    GRID_ASSERT(norm2(tmp_d)< 1.0);
 
     axpy(mmp_d,mass[0],p_d,mmp_d);
     RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
    
     }
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
 
 };
diff --git a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
index 514800fc..479257fd 100644
--- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
 	 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
 public:
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw a GRID_ASSERT when the CG fails to converge.
   // Defaults true.
   RealD Tolerance;
   Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
       DoFinalCleanup(true),
       Linop_fallback(NULL)
   {
-    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
+    GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
   };
 
   void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
 
     // Initial residual computation & set up
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
     
     Linop_d.HermOpAndNorm(psi, mmp, d, b);
     
@@ -217,7 +217,7 @@ public:
 	  CG(Linop_d,src,psi);
 	  IterationsToCleanup = CG.IterationsToComplete;
 	}
-	else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+	else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
 
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
 	return;
@@ -263,7 +263,7 @@ public:
     std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
 	      << std::endl;
       
-    if (ErrorOnNoConverge) assert(0);
+    if (ErrorOnNoConverge) GRID_ASSERT(0);
     IterationsToComplete = k;
     ReliableUpdatesPerformed = l;      
   }    
diff --git a/Grid/algorithms/iterative/ConjugateResidual.h b/Grid/algorithms/iterative/ConjugateResidual.h
index e0c3b69d..ece1b485 100644
--- a/Grid/algorithms/iterative/ConjugateResidual.h
+++ b/Grid/algorithms/iterative/ConjugateResidual.h
@@ -106,7 +106,7 @@ public:
     }
 
     std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 NAMESPACE_END(Grid);
diff --git a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
index 33bebcbb..580df65a 100644
--- a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
  public:
   using OperatorFunction<Field>::operator();
 
-  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw a GRID_ASSERT when FCAGMRES fails to converge,
                           // defaults to true
 
   RealD   Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
     conformable(psi, src);
 
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD cp;
     RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
     std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
   }
 
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
       }
     }
 
-    assert(0); // Never reached
+    GRID_ASSERT(0); // Never reached
     return cp;
   }
 
diff --git a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
index cf108846..2f0de242 100644
--- a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
  public:
   using OperatorFunction<Field>::operator();
 
-  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw a GRID_ASSERT when FGMRES fails to converge,
                           // defaults to true
 
   RealD   Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
     conformable(psi, src);
 
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD cp;
     RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
     std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
   }
 
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
       }
     }
 
-    assert(0); // Never reached
+    GRID_ASSERT(0); // Never reached
     return cp;
   }
 
diff --git a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
index 0596e91e..98b891bd 100644
--- a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
  public:
   using OperatorFunction<Field>::operator();
 
-  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw a GRID_ASSERT when GMRES fails to converge,
                           // defaults to true
 
   RealD   Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
     conformable(psi, src);
 
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD cp;
     RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
     std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
   }
 
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
       }
     }
 
-    assert(0); // Never reached
+    GRID_ASSERT(0); // Never reached
     return cp;
   }
 
diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
index c5d00722..8e0641f4 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
@@ -175,7 +175,7 @@ public:
       eresid(_eresid),  MaxIter(_MaxIter),
       diagonalisation(_diagonalisation),split_test(0),
       Nevec_acc(_Nu)
-  { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
+  { GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
 
   ////////////////////////////////
   // Helpers
@@ -206,7 +206,7 @@ public:
           Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
       }
     }
-    assert(normalize(w,if_print) != 0);
+    GRID_ASSERT(normalize(w,if_print) != 0);
   }
   void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
   {
@@ -225,7 +225,7 @@ public:
       w[i] = w[i] - ip * evec[j];
     }}
     for(int i=0; i<_Nu; ++i)
-    assert(normalize(w[i],if_print) !=0);
+    GRID_ASSERT(normalize(w[i],if_print) !=0);
   }
 
 
@@ -244,7 +244,7 @@ public:
     const uint64_t sites = grid->lSites();
 
     int Nbatch = R/Nevec_acc;
-    assert( R%Nevec_acc == 0 );
+    GRID_ASSERT( R%Nevec_acc == 0 );
 //    Glog << "nBatch, Nevec_acc, R, Nu = " 
 //         << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
     
@@ -302,7 +302,7 @@ public:
       }
     }
     for (int i=0; i<Nu; ++i) {
-      assert(normalize(w[i],do_print)!=0);
+      GRID_ASSERT(normalize(w[i],do_print)!=0);
     }
     
     Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
   {
     std::string fname = std::string(cname+"::calc_irbl()"); 
     GridBase *grid = evec[0].Grid();
-    assert(grid == src[0].Grid());
-    assert( Nu = src.size() );
+    GRID_ASSERT(grid == src[0].Grid());
+    GRID_ASSERT( Nu = src.size() );
     
     Glog << std::string(74,'*') << std::endl;
     Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
     }
     Glog << std::string(74,'*') << std::endl;
     
-    assert(Nm == evec.size() && Nm == eval.size());
+    GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 
     std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
     std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
   {
     std::string fname = std::string(cname+"::calc_rbl()"); 
     GridBase *grid = evec[0].Grid();
-    assert(grid == src[0].Grid());
-    assert( Nu = src.size() );
+    GRID_ASSERT(grid == src[0].Grid());
+    GRID_ASSERT( Nu = src.size() );
 
     int Np = (Nm-Nk);
     if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
     }
     Glog << std::string(74,'*') << std::endl;
     
-    assert(Nm == evec.size() && Nm == eval.size());
+    GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 	
     std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
     std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
@@ -785,7 +785,7 @@ private:
     
     int Nu = w.size();
     int Nm = evec.size();
-    assert( b < Nm/Nu );
+    GRID_ASSERT( b < Nm/Nu );
 //    GridCartesian *grid = evec[0]._grid;
     
     // converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
 
     Glog << "Using split grid"<< std::endl;
 //   LatticeGaugeField s_Umu(SGrid);
-   assert((Nu%mrhs)==0);
+   GRID_ASSERT((Nu%mrhs)==0);
    std::vector<Field>   in(mrhs,f_grid);
      
     Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
     
     for (int u=0; u<Nu; ++u) {
 //      Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
-      assert (!isnan(norm2(w[u])));
+      GRID_ASSERT (!isnan(norm2(w[u])));
       for (int k=L+u; k<R; ++k) {
         Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
       }
@@ -929,8 +929,8 @@ if(split_test){
 			 Eigen::MatrixXcd & Qt, // Nm x Nm
 			 GridBase *grid)
   {
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
 			 GridBase *grid)
   {
     Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
       diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #endif
     } else { 
-      assert(0);
+      GRID_ASSERT(0);
     }
   }
   
@@ -1131,8 +1131,8 @@ if (1){
          Eigen::MatrixXcd& M)
   {
     //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     M = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     // rearrange 
@@ -1159,8 +1159,8 @@ if (1){
          Eigen::MatrixXcd& M)
   {
     //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     
     // rearrange 
     for ( int u=0; u<Nu; ++u ) {
diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h
index 4c7db462..911a5dd0 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h
@@ -121,7 +121,7 @@ public:
       eresid(_eresid),  MaxIter(_MaxIter),
       diagonalisation(_diagonalisation),
       Nevec_acc(_Nu)
-  { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
+  { GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
 
   ////////////////////////////////
   // Helpers
@@ -151,7 +151,7 @@ public:
           Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
       }
     }
-    assert(normalize(w,if_print) != 0);
+    GRID_ASSERT(normalize(w,if_print) != 0);
   }
   void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
   {
@@ -169,7 +169,7 @@ public:
       w[i] = w[i] - ip * evec[j];
     }}
     for(int i=0; i<_Nu; ++i)
-    assert(normalize(w[i],if_print) !=0);
+    GRID_ASSERT(normalize(w[i],if_print) !=0);
   }
   
   void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
@@ -205,8 +205,8 @@ public:
   {
     std::string fname = std::string(cname+"::calc_irbl()"); 
     GridBase *grid = evec[0].Grid();
-    assert(grid == src[0].Grid());
-    assert( Nu = src.size() );
+    GRID_ASSERT(grid == src[0].Grid());
+    GRID_ASSERT( Nu = src.size() );
     
     Glog << std::string(74,'*') << std::endl;
     Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
@@ -227,7 +227,7 @@ public:
     }
     Glog << std::string(74,'*') << std::endl;
     
-    assert(Nm == evec.size() && Nm == eval.size());
+    GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 
     std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
     std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
@@ -413,8 +413,8 @@ public:
   {
     std::string fname = std::string(cname+"::calc_rbl()"); 
     GridBase *grid = evec[0].Grid();
-    assert(grid == src[0].Grid());
-    assert( Nu = src.size() );
+    GRID_ASSERT(grid == src[0].Grid());
+    GRID_ASSERT( Nu = src.size() );
 
     int Np = (Nm-Nk);
     if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -441,7 +441,7 @@ public:
     }
     Glog << std::string(74,'*') << std::endl;
     
-    assert(Nm == evec.size() && Nm == eval.size());
+    GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 	
     std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));  
     std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));  
@@ -622,7 +622,7 @@ private:
     
     int Nu = w.size();
     int Nm = evec.size();
-    assert( b < Nm/Nu );
+    GRID_ASSERT( b < Nm/Nu );
     
     // converts block index to full indicies for an interval [L,R)
     int L = Nu*b;
@@ -630,7 +630,7 @@ private:
 
     Real beta;
 
-    assert((Nu%mrhs)==0);
+    GRID_ASSERT((Nu%mrhs)==0);
     std::vector<Field>   in(mrhs,f_grid);
     std::vector<Field>   out(mrhs,f_grid);
 
@@ -711,7 +711,7 @@ private:
     
     for (int u=0; u<Nu; ++u) {
       //      Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
-      assert (!isnan(norm2(w[u])));
+      GRID_ASSERT (!isnan(norm2(w[u])));
       for (int k=L+u; k<R; ++k) {
 	//        Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
       }
@@ -734,8 +734,8 @@ private:
 			 Eigen::MatrixXcd & Qt, // Nm x Nm
 			 GridBase *grid)
   {
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     for ( int u=0; u<Nu; ++u ) {
@@ -775,8 +775,8 @@ private:
 			 GridBase *grid)
   {
     Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     for ( int u=0; u<Nu; ++u ) {
@@ -924,7 +924,7 @@ if (1){
       diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #endif
     } else { 
-      assert(0);
+      GRID_ASSERT(0);
     }
   }
   
@@ -936,8 +936,8 @@ if (1){
          Eigen::MatrixXcd& M)
   {
     //    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; 
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     M = Eigen::MatrixXcd::Zero(Nk,Nk);
     
     // rearrange 
@@ -964,8 +964,8 @@ if (1){
          Eigen::MatrixXcd& M)
   {
     //    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; 
-    assert( Nk%Nu == 0 && Nm%Nu == 0 );
-    assert( Nk <= Nm );
+    GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+    GRID_ASSERT( Nk <= Nm );
     
     // rearrange 
     for ( int u=0; u<Nu; ++u ) {
diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
index df2007d2..397aa78d 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -226,7 +226,7 @@ until convergence
   void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
   {
     GridBase *grid = src.Grid();
-    assert(grid == evec[0].Grid());
+    GRID_ASSERT(grid == evec[0].Grid());
     
     //    GridLogIRL.TimingMode(1);
     std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -246,7 +246,7 @@ until convergence
     }
     std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
 	
-    assert(Nm <= evec.size() && Nm <= eval.size());
+    GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size());
     
     // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
     RealD evalMaxApprox = 0.0;
@@ -357,7 +357,7 @@ until convergence
       }
       std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
 
-      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
+      GRID_ASSERT(k2<Nm);      GRID_ASSERT(k2<Nm);      GRID_ASSERT(k1>0);
 
       basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
       std::cout<<GridLogIRL <<"basisRotated  by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -484,7 +484,7 @@ until convergence
   {
     std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
     const RealD tiny = 1.0e-20;
-    assert( k< Nm );
+    GRID_ASSERT( k< Nm );
 
     GridStopWatch gsw_op,gsw_o;
 
@@ -618,7 +618,7 @@ until convergence
     }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
       diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
     } else { 
-      assert(0);
+      GRID_ASSERT(0);
     }
   }
 
@@ -708,7 +708,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
     }
   }
 #else 
-  assert(0);
+  GRID_ASSERT(0);
 #endif
 }
 
diff --git a/Grid/algorithms/iterative/KrylovSchur.h b/Grid/algorithms/iterative/KrylovSchur.h
index 71469cc2..0ebf29d2 100644
--- a/Grid/algorithms/iterative/KrylovSchur.h
+++ b/Grid/algorithms/iterative/KrylovSchur.h
@@ -30,7 +30,17 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef GRID_KRYLOVSCHUR_H
 #define GRID_KRYLOVSCHUR_H
 
-NAMESPACE_BEGIN(Grid); 
+NAMESPACE_BEGIN(Grid);
+
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+using thrust::abs;
+using thrust::conj;
+inline std::complex<double> toStdCmplx(const ComplexD& c) { return {c.real(), c.imag()}; }
+#else
+using std::abs;
+using std::conj;
+inline const ComplexD& toStdCmplx(const ComplexD& c) { return c; }
+#endif
 
 /**
  * Options for which Ritz values to keep in implicit restart. TODO move this and utilities into a new file
@@ -209,8 +219,8 @@ class ComplexSchurDecomposition {
       s    = S(i, i+1);
       lam1 = S(i, i);
       lam2 = S(i+1, i+1);
-      phi  = s / std::abs(s);
-      r    = std::sqrt(std::pow(std::abs(s), 2) + std::pow(std::abs(lam2 - lam1), 2));
+      phi  = s / abs(s);
+      r    = std::sqrt(std::pow(abs(s), 2) + std::pow(abs(lam2 - lam1), 2));
 
       // // Original code which performs Givens rotations by manual matrix multiplication
       // // compute Givens rotation corresponding to these parameters
@@ -225,10 +235,10 @@ class ComplexSchurDecomposition {
 
       // Modified code
       Givens = CMat::Identity(2, 2);
-      Givens(0, 0)     = std::abs(s) / r;
+      Givens(0, 0)     = abs(s) / r;
       Givens(1, 1) = Givens(0, 0);
-      Givens(0, 1)   = (phi / r) * std::conj(lam2 - lam1);
-      Givens(1, 0)   = -std::conj(Givens(0, 1));
+      Givens(0, 1)   = (phi / r) * conj(lam2 - lam1);
+      Givens(1, 0)   = -conj(Givens(0, 1));
 
       // TODO: make sure these are correct
       Eigen::MatrixXcd tmp;
@@ -661,7 +671,7 @@ if (!shift){
         // Linop.Op(basis[i], w);
         for (int j = 0; j < basis.size(); j++) {
           coeff = innerProduct(basis[j], w);       // coeff = h_{ij}. Note that since {vi} is ONB it's OK to subtract it off after. 
-          Rayleigh(j, i) = coeff;
+          Rayleigh(j, i) = toStdCmplx(coeff);
           w -= coeff * basis[j];
         }
 
@@ -669,7 +679,7 @@ if (!shift){
           std::cout << GridLogDebug << "Double orthogonalizing." << std::endl;
           for (int j = 0; j < basis.size(); j++) {
             coeff = innerProduct(basis[j], w);      // see if there is any residual component in basis[j] direction
-            Rayleigh(j, i) += coeff;                // if coeff is non-zero, adjust Rayleigh
+            Rayleigh(j, i) += toStdCmplx(coeff);   // if coeff is non-zero, adjust Rayleigh
             w -= coeff * basis[j];
           }
         }
@@ -677,7 +687,7 @@ if (!shift){
         // add w_i to the pile
         if (i < Nm - 1) {
           coeff = std::sqrt(norm2(w));
-          Rayleigh(i+1, i) = coeff;
+          Rayleigh(i+1, i) = toStdCmplx(coeff);
           basis.push_back(
             (1.0/coeff) * w
           );
diff --git a/Grid/algorithms/iterative/LanczosBidiagonalization.h b/Grid/algorithms/iterative/LanczosBidiagonalization.h
new file mode 100644
index 00000000..4733d03e
--- /dev/null
+++ b/Grid/algorithms/iterative/LanczosBidiagonalization.h
@@ -0,0 +1,276 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/LanczosBidiagonalization.h
+
+Copyright (C) 2015
+
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LANCZOS_BIDIAGONALIZATION_H
+#define GRID_LANCZOS_BIDIAGONALIZATION_H
+
+NAMESPACE_BEGIN(Grid);
+
+/**
+ * Lanczos Bidiagonalization (Golub-Kahan)
+ *
+ * For a linear operator A with adjoint A^dag, constructs the bidiagonal
+ * decomposition:
+ *
+ *   A  V_m = U_m B_m
+ *   A^dag U_m = V_m B_m^T + beta_{m+1} v_{m+1} e_m^T
+ *
+ * where:
+ *   V_m = [v_1, ..., v_m]  right Lanczos vectors (orthonormal)
+ *   U_m = [u_1, ..., u_m]  left  Lanczos vectors (orthonormal)
+ *   B_m is upper bidiagonal with diag(alpha_1,...,alpha_m) and
+ *       superdiag(beta_2,...,beta_m)
+ *
+ * The singular values of A are approximated by those of B_m.
+ * The singular values of B_m are the square roots of the eigenvalues of
+ * the symmetric tridiagonal matrix B_m^T B_m.
+ *
+ * Usage:
+ *   LanczosBidiagonalization<Field> lb(Linop, grid, tol);
+ *   lb.run(src, Nm);
+ *   // Access results via getters.
+ */
+template <class Field>
+class LanczosBidiagonalization {
+
+public:
+  LinearOperatorBase<Field> &Linop;  // operator A; run() applies Op() and AdjOp()
+  GridBase *Grid;                    // grid used to construct all work vectors
+
+  int Nm;           // number of Lanczos steps taken
+  RealD Tolerance;  // convergence threshold on beta_{k+1} / alpha_k
+
+  std::vector<Field>  V;       // right Lanczos vectors v_1 ... v_m
+  std::vector<Field>  U;       // left  Lanczos vectors u_1 ... u_m
+  std::vector<RealD>  alpha;   // diagonal of bidiagonal matrix
+  std::vector<RealD>  beta;    // super-diagonal (beta[k] couples u_k and v_{k+1})
+
+  // SVD of the bidiagonal matrix (filled after computeSVD())
+  Eigen::VectorXd  singularValues;
+  Eigen::MatrixXd  leftSVecs;   // columns are left  singular vectors of B
+  Eigen::MatrixXd  rightSVecs;  // columns are right singular vectors of B
+
+  // ---- construction, driver and post-processing ----
+
+  LanczosBidiagonalization(LinearOperatorBase<Field> &_Linop, GridBase *_Grid,
+                           RealD _tol = 1.0e-8)
+    : Linop(_Linop), Grid(_Grid), Nm(0), Tolerance(_tol)  // declaration order
+  {}
+
+  /**
+   * Run the Golub-Kahan Lanczos bidiagonalization, replacing any previous
+   * V, U, alpha, beta (the SVD members are only refreshed by computeSVD()).
+   * Stops after Nmax steps, on alpha < 1e-14, on beta/alpha < Tolerance,
+   * or when beta is not positive (no next right vector can be formed).
+   * src    : starting vector, must be non-zero (need not be normalised)
+   * Nmax   : maximum number of Lanczos steps
+   * reorth : if true, full reorthogonalisation of both V and U bases
+   */
+  void run(const Field &src, int Nmax, bool reorth = true)
+  {
+    GRID_ASSERT(norm2(src) > 0.0);
+
+    V.clear(); U.clear();
+    alpha.clear(); beta.clear();
+    Nm = 0;
+
+    Field p(Grid), r(Grid);
+
+    // --- initialise: v_1 = src / ||src|| ---
+    Field v(Grid);
+    v = src;
+    RealD nrm = std::sqrt(norm2(v));
+    v = (1.0 / nrm) * v;
+    V.push_back(v);
+
+    for (int k = 0; k < Nmax; ++k) {
+
+      // p = A v_k
+      Linop.Op(V[k], p);
+
+      // p = p - beta_k * u_{k-1}   (remove previous left vector)
+      if (k > 0) {
+        p = p - beta[k-1] * U[k-1];
+      }
+
+      // alpha_k = ||p||
+      RealD ak = std::sqrt(norm2(p));
+      if (ak < 1.0e-14) {
+        std::cout << GridLogMessage
+                  << "LanczosBidiagonalization: lucky breakdown at step "
+                  << k << " (alpha = " << ak << ")" << std::endl;
+        break;
+      }
+      alpha.push_back(ak);
+
+      // u_k = p / alpha_k
+      Field u(Grid);
+      u = (1.0 / ak) * p;
+
+      // full reorthogonalisation of u against previous U
+      if (reorth) {
+        for (int j = 0; j < (int)U.size(); ++j) {
+          ComplexD ip = innerProduct(U[j], u);
+          u = u - ip * U[j];
+        }
+        RealD unrm = std::sqrt(norm2(u));
+        if (unrm > 1.0e-14) u = (1.0 / unrm) * u;
+      }
+      U.push_back(u);
+
+      // r = A^dag u_k - alpha_k * v_k
+      Linop.AdjOp(U[k], r);
+      r = r - ak * V[k];
+
+      // full reorthogonalisation of r against previous V
+      if (reorth) {
+        for (int j = 0; j < (int)V.size(); ++j) {
+          ComplexD ip = innerProduct(V[j], r);
+          r = r - ip * V[j];
+        }
+      }
+
+      // beta_{k+1} = ||r||
+      RealD bk = std::sqrt(norm2(r));
+      beta.push_back(bk);
+
+      Nm = k + 1;
+
+      std::cout << GridLogMessage
+                << "LanczosBidiagonalization step " << k
+                << "  alpha = " << ak
+                << "  beta  = " << bk << std::endl;
+
+      // convergence: residual beta / alpha small enough
+      if (bk / ak < Tolerance) {
+        std::cout << GridLogMessage
+                  << "LanczosBidiagonalization converged at step " << k
+                  << "  (beta/alpha = " << bk / ak << ")" << std::endl;
+        break;
+      }
+
+      if (k == Nmax - 1 || !(bk > 0.0)) break;   // last step, or zero/NaN beta: never divide by it
+
+      // v_{k+1} = r / beta_{k+1}
+      Field vnext(Grid);
+      vnext = (1.0 / bk) * r;
+      V.push_back(vnext);
+    }
+  }
+
+  /**
+   * Compute the SVD of the Nm x Nm bidiagonal matrix B using Eigen.
+   * Singular values are stored in descending order.
+   */
+  void computeSVD()
+  {
+    int m = Nm;
+    Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
+
+    for (int k = 0; k < m; ++k) {
+      B(k, k) = alpha[k];
+      if (k + 1 < m && k < (int)beta.size())
+        B(k, k+1) = beta[k];
+    }
+
+    Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
+        Eigen::ComputeThinU | Eigen::ComputeThinV);
+
+    singularValues = svd.singularValues();   // already sorted descending
+    leftSVecs      = svd.matrixU();
+    rightSVecs     = svd.matrixV();
+
+    std::cout << GridLogMessage
+              << "LanczosBidiagonalization: singular values of B_" << m
+              << std::endl;
+    for (int k = 0; k < m; ++k)
+      std::cout << GridLogMessage << "  sigma[" << k << "] = "
+                << singularValues(k) << std::endl;
+  }
+
+  /**
+   * Return the k-th approximate left singular vector of A, sum_j leftSVecs(j,k) U[j].
+   * computeSVD() must have been called after the latest run(); 0 <= k < cols.
+   */
+  Field leftSingularVector(int k)
+  {
+    GRID_ASSERT(k >= 0 && k < (int)leftSVecs.cols() && (int)leftSVecs.rows() == Nm);
+    Field svec(Grid);
+    svec = Zero();
+    for (int j = 0; j < Nm; ++j)
+      svec = svec + leftSVecs(j, k) * U[j];
+    return svec;
+  }
+
+  /**
+   * Return the k-th approximate right singular vector of A, sum_j rightSVecs(j,k) V[j].
+   * computeSVD() must have been called after the latest run(); 0 <= k < cols.
+   */
+  Field rightSingularVector(int k)
+  {
+    GRID_ASSERT(k >= 0 && k < (int)rightSVecs.cols() && (int)rightSVecs.rows() == Nm);
+    Field svec(Grid);
+    svec = Zero();
+    for (int j = 0; j < Nm; ++j)
+      svec = svec + rightSVecs(j, k) * V[j];
+    return svec;
+  }
+
+  /**
+   * Verify the bidiagonalization: returns max residual
+   *   max_k || A v_k - alpha_k u_k - beta_k u_{k-1} ||
+   */
+  RealD verify()
+  {
+    Field tmp(Grid);
+    RealD maxres = 0.0;
+    for (int k = 0; k < Nm; ++k) {
+      Linop.Op(V[k], tmp);
+      tmp = tmp - alpha[k] * U[k];
+      if (k > 0 && k-1 < (int)beta.size())
+        tmp = tmp - beta[k-1] * U[k-1];
+      RealD res = std::sqrt(norm2(tmp));
+      if (res > maxres) maxres = res;
+      std::cout << GridLogMessage
+                << "LanczosBidiagonalization verify step " << k
+                << "  ||A v_k - alpha_k u_k - beta_{k-1} u_{k-1}|| = "
+                << res << std::endl;
+    }
+    return maxres;
+  }
+
+  /* Getters */
+  int                       getNm()           const { return Nm; }
+  const std::vector<Field>& getV()            const { return V; }
+  const std::vector<Field>& getU()            const { return U; }
+  const std::vector<RealD>& getAlpha()        const { return alpha; }
+  const std::vector<RealD>& getBeta()         const { return beta; }
+  Eigen::VectorXd           getSingularValues() const { return singularValues; }
+};
+
+NAMESPACE_END(Grid);
+#endif
diff --git a/Grid/algorithms/iterative/LocalCoherenceLanczos.h b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
index 344a785a..f7da43c1 100644
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -80,7 +80,7 @@ public:
   ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
     _Linop(linop), subspace(_subspace)
   {  
-    assert(subspace.size() >0);
+    GRID_ASSERT(subspace.size() >0);
   };
 
   void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
 
   void testFine(RealD resid) 
   {
-    assert(evals_fine.size() == nbasis);
-    assert(subspace.size() == nbasis);
+    GRID_ASSERT(evals_fine.size() == nbasis);
+    GRID_ASSERT(subspace.size() == nbasis);
     PlainHermOp<FineField>    Op(_FineOp);
     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
     for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
+      GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
     }
   }
 
@@ -359,8 +359,8 @@ public:
   //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
   {
-    assert(evals_fine.size() == nbasis);
-    assert(subspace.size() == nbasis);
+    GRID_ASSERT(evals_fine.size() == nbasis);
+    GRID_ASSERT(subspace.size() == nbasis);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
   void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
 		RealD MaxIt, RealD betastp, int MinRes)
   {
-    assert(nbasis<=Nm);
+    GRID_ASSERT(nbasis<=Nm);
     Chebyshev<FineField>      Cheby(cheby_parms);
     FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
     PlainHermOp<FineField>    Op(_FineOp);
@@ -400,8 +400,8 @@ public:
     IRL.calc(evals_fine,subspace,src,Nconv,false);
     
     // Shrink down to number saved
-    assert(Nstop>=nbasis);
-    assert(Nconv>=nbasis);
+    GRID_ASSERT(Nstop>=nbasis);
+    GRID_ASSERT(Nconv>=nbasis);
     evals_fine.resize(nbasis);
     subspace.resize(nbasis,_FineGrid);
   }
@@ -433,7 +433,7 @@ public:
     ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
     int Nconv=0;
     IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
-    assert(Nconv>=Nstop);
+    GRID_ASSERT(Nconv>=Nstop);
     evals_coarse.resize(Nstop);
     evec_coarse.resize (Nstop,_CoarseGrid);
     for (int i=0;i<Nstop;i++){
diff --git a/Grid/algorithms/iterative/MinimalResidual.h b/Grid/algorithms/iterative/MinimalResidual.h
index 33b79ac2..70e29754 100644
--- a/Grid/algorithms/iterative/MinimalResidual.h
+++ b/Grid/algorithms/iterative/MinimalResidual.h
@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
  public:
   using OperatorFunction<Field>::operator();
 
-  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
+  bool ErrorOnNoConverge; // throw a GRID_ASSERT when the MR fails to converge.
                           // Defaults true.
   RealD   Tolerance;
   Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 
     // Initial residual computation & set up
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD ssq = norm2(src);
     RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
         std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
 
         if (ErrorOnNoConverge)
-          assert(true_residual / Tolerance < 10000.0);
+          GRID_ASSERT(true_residual / Tolerance < 10000.0);
 
         IterationsToComplete = k;
 
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
               << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
 
     IterationsToComplete = k;
   }
diff --git a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
index d75fdb63..7a297836 100644
--- a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
 
   using OperatorFunction<FieldD>::operator();
 
-  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw a GRID_ASSERT when MPFGMRES fails to converge,
                           // defaults to true
 
   RealD   Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
     conformable(psi, src);
 
     RealD guess = norm2(psi);
-    assert(std::isnan(guess) == 0);
+    GRID_ASSERT(std::isnan(guess) == 0);
 
     RealD cp;
     RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
     std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
 
     if (ErrorOnNoConverge)
-      assert(0);
+      GRID_ASSERT(0);
   }
 
   RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
       }
     }
 
-    assert(0); // Never reached
+    GRID_ASSERT(0); // Never reached
     return cp;
   }
 
diff --git a/Grid/algorithms/iterative/PrecConjugateResidual.h b/Grid/algorithms/iterative/PrecConjugateResidual.h
index b6178833..91f41495 100644
--- a/Grid/algorithms/iterative/PrecConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecConjugateResidual.h
@@ -112,7 +112,7 @@ public:
     }
 
     std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 NAMESPACE_END(Grid);
diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
index feceb21f..388022be 100644
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
@@ -118,7 +118,7 @@ public:
 
     }
     GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
-    //    assert(0);
+    //    GRID_ASSERT(0);
   }
 
   RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
       int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
       for(int back=0;back<northog;back++){
 
-	int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
+	int peri_back=(k-back)%mmax;   	  GRID_ASSERT((k-back)>=0);
 
 	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
 	p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
       qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
       LinalgTimer.Stop();
     }
-    assert(0); // never reached
+    GRID_ASSERT(0); // never reached
     return cp;
   }
 };
diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
index 7bac5667..b0166ab5 100644
--- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
+++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h
@@ -123,7 +123,7 @@ public:
 
     }
     GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
-    //    assert(0);
+    //    GRID_ASSERT(0);
   }
 
   RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -268,8 +268,7 @@ public:
       // we iterate backwards counting down from the current k+1 index (peri_kp) because we 
       for(int back=0;back<northog;back++){
 
-        int peri_back=(k-back)%mmax;   	  assert((k-back)>=0);
-        // std::cout << "peri_back: " << peri_back << std::endl;
+        int peri_back=(k-back)%mmax;   	  GRID_ASSERT((k-back)>=0);
 
         // b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
         b=-(innerProduct(q[peri_back],Az))/qq[peri_back];     // TODO try complex beta
@@ -300,7 +299,7 @@ public:
       }
 
     }
-    assert(0); // never reached
+    GRID_ASSERT(0); // never reached
     return cp;
   }
 };
@@ -410,7 +409,10 @@ public:
     for(int i = 0; i < k; i++) {
       polynomial[i] += a * poly_p[k-1][i];
     }
-    PF.data.push_back(polynomial);
+    {
+      std::vector<std::complex<double>> poly_stdcmplx(polynomial.begin(), polynomial.end());
+      PF.data.push_back(poly_stdcmplx);
+    }
 
     //  r_{k+1} --> r_k - a_k A p_k
     //  p_{k+1} --> r_k + \sum_{i=0}^k \beta_{ik} p_i, input betas = (\beta_{ik})_i
@@ -436,8 +438,9 @@ public:
     /** Logs all alphas and betas to complete the iterations. */
     std::cout << "PGCR::LogComplete() "<<std::endl;
     for (int i = 0; i < alphas.size(); i++) {
-      PF.alphas.push_back(alphas[i]);
-      PF.betas.push_back(betas[i]);
+      PF.alphas.push_back(std::complex<double>(alphas[i].real(), alphas[i].imag()));
+      std::vector<std::complex<double>> beta_stdcmplx(betas[i].begin(), betas[i].end());
+      PF.betas.push_back(beta_stdcmplx);
     }
   };
 
diff --git a/Grid/algorithms/iterative/QuasiMinimalResidual.h b/Grid/algorithms/iterative/QuasiMinimalResidual.h
index ea5531b6..97a82cf9 100644
--- a/Grid/algorithms/iterative/QuasiMinimalResidual.h
+++ b/Grid/algorithms/iterative/QuasiMinimalResidual.h
@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 
     LinOp.Op(x,r); r = b - r;
 
-    assert(normb> 0.0);
+    GRID_ASSERT(normb> 0.0);
 
     resid = norm2(r)/normb;
     if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
     for (int i = 1; i <= MaxIterations; i++) {
 
       // Breakdown tests
-      assert( rho != 0.0);
-      assert( xi  != 0.0);
+      GRID_ASSERT( rho != 0.0);
+      GRID_ASSERT( xi  != 0.0);
 
       v = (1. / rho) * v_tld;
       y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
       ep=Zep.real();
       std::cout << "Zep "<<Zep <<std::endl;
       // Complex Audit
-      assert(abs(ep)>0);
+      GRID_ASSERT(abs(ep)>0);
 
       beta = ep / delta;
-      assert(abs(beta)>0);
+      GRID_ASSERT(abs(beta)>0);
 
       v_tld = p_tld - beta * v;
       y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
       std::cout << "theta "<<theta<<std::endl;
       std::cout << "gamma "<<gamma<<std::endl;
 
-      assert(abs(gamma)> 0.0);
+      GRID_ASSERT(abs(gamma)> 0.0);
 
       eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
 
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
       }
       std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
     }
-    assert(0);
+    GRID_ASSERT(0);
     return;                            // no convergence
   }
 #else
diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h
index 494aa77b..9e9110c3 100644
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -327,9 +327,9 @@ namespace Grid {
       /////////////////////////////////////////////////////
       // src_o = (source_o - Moe MeeInv source_e)
       /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+      _Matrix.MooeeInv(src_e,tmp);     GRID_ASSERT(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      GRID_ASSERT( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  GRID_ASSERT(  tmp.Checkerboard() ==Odd);     
 
       _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
     }
@@ -347,17 +347,17 @@ namespace Grid {
       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
-      src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+      _Matrix.Meooe(sol_o,tmp);        GRID_ASSERT(  tmp.Checkerboard()   ==Even);
+      src_e = src_e-tmp;               GRID_ASSERT(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   GRID_ASSERT(  sol_e.Checkerboard() ==Even);
      
-      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
-      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+      setCheckerboard(sol,sol_e); GRID_ASSERT(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); GRID_ASSERT(  sol_o.Checkerboard() ==Odd );
     }
     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
     {
       SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
-      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  GRID_ASSERT(sol_o.Checkerboard()==Odd);
     };
     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
     {
@@ -396,13 +396,13 @@ namespace Grid {
       /////////////////////////////////////////////////////
       // src_o = Mdag * (source_o - Moe MeeInv source_e)
       /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+      _Matrix.MooeeInv(src_e,tmp);     GRID_ASSERT(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      GRID_ASSERT( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  GRID_ASSERT(  tmp.Checkerboard() ==Odd);     
 
       // get the right MpcDag
       SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+      _HermOpEO.MpcDag(tmp,src_o);     GRID_ASSERT(src_o.Checkerboard() ==Odd);       
 
     }
     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.Checkerboard()   ==Even);
-      src_e_i = src_e-tmp;               assert(  src_e_i.Checkerboard() ==Even);
-      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+      _Matrix.Meooe(sol_o,tmp);          GRID_ASSERT(  tmp.Checkerboard()   ==Even);
+      src_e_i = src_e-tmp;               GRID_ASSERT(  src_e_i.Checkerboard() ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   GRID_ASSERT(  sol_e.Checkerboard() ==Even);
      
-      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
-      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+      setCheckerboard(sol,sol_e); GRID_ASSERT(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); GRID_ASSERT(  sol_o.Checkerboard() ==Odd );
     }
     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
     {
       SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
-      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  GRID_ASSERT(sol_o.Checkerboard()==Odd);
     };
     virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
     {
@@ -461,9 +461,9 @@ namespace Grid {
         /////////////////////////////////////////////////////
         // src_o = Mdag * (source_o - Moe MeeInv source_e)
         /////////////////////////////////////////////////////
-        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
-        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
-        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+        _Matrix.MooeeInv(src_e, tmp);   GRID_ASSERT(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    GRID_ASSERT(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  GRID_ASSERT( src_o.Checkerboard() == Odd  );     
       }
       
       virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
         ///////////////////////////////////////////////////
         // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
         ///////////////////////////////////////////////////
-        _Matrix.Meooe(sol_o, tmp);         assert(     tmp.Checkerboard() == Even );
-        src_e_i = src_e - tmp;             assert( src_e_i.Checkerboard() == Even );
-        _Matrix.MooeeInv(src_e_i, sol_e);  assert(   sol_e.Checkerboard() == Even );
+        _Matrix.Meooe(sol_o, tmp);         GRID_ASSERT(     tmp.Checkerboard() == Even );
+        src_e_i = src_e - tmp;             GRID_ASSERT( src_e_i.Checkerboard() == Even );
+        _Matrix.MooeeInv(src_e_i, sol_e);  GRID_ASSERT(   sol_e.Checkerboard() == Even );
        
-        setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
-        setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd  );
+        setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd  );
       }
 
       virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
       {
         NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
-        this->_HermitianRBSolver(_OpEO, src_o, sol_o);  assert(sol_o.Checkerboard() == Odd);
+        this->_HermitianRBSolver(_OpEO, src_o, sol_o);  GRID_ASSERT(sol_o.Checkerboard() == Odd);
       }
 
       virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -539,13 +539,13 @@ namespace Grid {
       /////////////////////////////////////////////////////
       // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
       /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      _Matrix.MooeeInv(src_e,tmp);     GRID_ASSERT(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      GRID_ASSERT( Mtmp.Checkerboard() ==Odd);     
       Mtmp=src_o-Mtmp;                 
-      _Matrix.MooeeInv(Mtmp,tmp);      assert( tmp.Checkerboard() ==Odd);     
+      _Matrix.MooeeInv(Mtmp,tmp);      GRID_ASSERT( tmp.Checkerboard() ==Odd);     
       
       // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+      _HermOpEO.MpcDag(tmp,src_o);     GRID_ASSERT(src_o.Checkerboard() ==Odd);       
     }
 
     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -560,12 +560,12 @@ namespace Grid {
       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o,tmp);    assert(  tmp.Checkerboard()   ==Even);
-      tmp = src_e-tmp;             assert(  src_e.Checkerboard() ==Even);
-      _Matrix.MooeeInv(tmp,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      _Matrix.Meooe(sol_o,tmp);    GRID_ASSERT(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;             GRID_ASSERT(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT(  sol_e.Checkerboard() ==Even);
      
-      setCheckerboard(sol,sol_e);  assert(  sol_e.Checkerboard() ==Even);
-      setCheckerboard(sol,sol_o);  assert(  sol_o.Checkerboard() ==Odd );
+      setCheckerboard(sol,sol_e);  GRID_ASSERT(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o);  GRID_ASSERT(  sol_o.Checkerboard() ==Odd );
     };
 
     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -612,12 +612,12 @@ namespace Grid {
       /////////////////////////////////////////////////////
       // src_o = Mdag * (source_o - Moe MeeInv source_e)
       /////////////////////////////////////////////////////
-      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+      _Matrix.MooeeInv(src_e,tmp);     GRID_ASSERT(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      GRID_ASSERT( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  GRID_ASSERT(  tmp.Checkerboard() ==Odd);     
 
       // get the right MpcDag
-      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+      _HermOpEO.MpcDag(tmp,src_o);     GRID_ASSERT(src_o.Checkerboard() ==Odd);       
     }
 
     virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -638,12 +638,12 @@ namespace Grid {
       ///////////////////////////////////////////////////
       // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
       ///////////////////////////////////////////////////
-      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.Checkerboard()   ==Even);
-      tmp = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+      _Matrix.Meooe(sol_o_i,tmp);    GRID_ASSERT(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;               GRID_ASSERT(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(tmp,sol_e);   GRID_ASSERT(  sol_e.Checkerboard() ==Even);
      
-      setCheckerboard(sol,sol_e);    assert(  sol_e.Checkerboard() ==Even);
-      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.Checkerboard() ==Odd );
+      setCheckerboard(sol,sol_e);    GRID_ASSERT(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o_i);  GRID_ASSERT(  sol_o_i.Checkerboard() ==Odd );
     };
 
     virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -684,9 +684,9 @@ namespace Grid {
         /////////////////////////////////////////////////////
         // src_o = Mdag * (source_o - Moe MeeInv source_e)
         /////////////////////////////////////////////////////
-        _Matrix.MooeeInv(src_e, tmp);   assert(   tmp.Checkerboard() == Even );
-        _Matrix.Meooe   (tmp, Mtmp);    assert(  Mtmp.Checkerboard() == Odd  );     
-        src_o -= Mtmp;                  assert( src_o.Checkerboard() == Odd  );     
+        _Matrix.MooeeInv(src_e, tmp);   GRID_ASSERT(   tmp.Checkerboard() == Even );
+        _Matrix.Meooe   (tmp, Mtmp);    GRID_ASSERT(  Mtmp.Checkerboard() == Odd  );     
+        src_o -= Mtmp;                  GRID_ASSERT( src_o.Checkerboard() == Odd  );     
       }
 
       virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -707,12 +707,12 @@ namespace Grid {
         ///////////////////////////////////////////////////
         // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
         ///////////////////////////////////////////////////
-        _Matrix.Meooe(sol_o_i, tmp);    assert(   tmp.Checkerboard() == Even );
-        tmp = src_e - tmp;              assert( src_e.Checkerboard() == Even );
-        _Matrix.MooeeInv(tmp, sol_e);   assert( sol_e.Checkerboard() == Even );
+        _Matrix.Meooe(sol_o_i, tmp);    GRID_ASSERT(   tmp.Checkerboard() == Even );
+        tmp = src_e - tmp;              GRID_ASSERT( src_e.Checkerboard() == Even );
+        _Matrix.MooeeInv(tmp, sol_e);   GRID_ASSERT( sol_e.Checkerboard() == Even );
        
-        setCheckerboard(sol, sol_e);    assert(   sol_e.Checkerboard() == Even );
-        setCheckerboard(sol, sol_o_i);  assert( sol_o_i.Checkerboard() == Odd  );
+        setCheckerboard(sol, sol_e);    GRID_ASSERT(   sol_e.Checkerboard() == Even );
+        setCheckerboard(sol, sol_o_i);  GRID_ASSERT( sol_o_i.Checkerboard() == Odd  );
       };
 
       virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
diff --git a/Grid/algorithms/multigrid/Aggregates.h b/Grid/algorithms/multigrid/Aggregates.h
index 4a4c4816..fc527929 100644
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@@ -97,7 +97,7 @@ public:
 
     RealD scale;
 
-    ConjugateGradient<FineField> CG(1.0e-3,400,false);
+    ConjugateGradient<FineField> CG(1.0e-4,2000,false);
     FineField noise(FineGrid);
     FineField Mn(FineGrid);
 
@@ -131,7 +131,10 @@ public:
     RealD scale;
 
     TrivialPrecon<FineField> simple_fine;
-    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,30,30);
+    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,12,12);
+    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,10,10);
     FineField noise(FineGrid);
     FineField src(FineGrid);
     FineField guess(FineGrid);
@@ -146,16 +149,16 @@ public:
       
       DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
 
-      for(int i=0;i<2;i++){
+      for(int i=0;i<3;i++){
 	//  void operator() (const Field &src, Field &psi){
 #if 1
-	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+	if (i==0)std::cout << GridLogMessage << " inverting on noise "<<std::endl;
 	src = noise;
 	guess=Zero();
 	GCR(src,guess);
 	subspace[b] = guess;
 #else
-	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+	if (i==0)std::cout << GridLogMessage << " inverting on zero "<<std::endl;
 	src=Zero();
 	guess = noise;
 	GCR(src,guess);
@@ -167,7 +170,7 @@ public:
 
       }
 
-      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<" <f|OpDagOp|f>"<<norm2(Mn)<<std::endl;
       subspace[b]   = noise;
 
     }
@@ -292,7 +295,7 @@ public:
 	  
       }
     }
-    assert(b==nn);
+    GRID_ASSERT(b==nn);
   }
 
 
diff --git a/Grid/algorithms/multigrid/CoarsenedMatrix.h b/Grid/algorithms/multigrid/CoarsenedMatrix.h
index 60a5920c..aff84988 100644
--- a/Grid/algorithms/multigrid/CoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h
@@ -309,7 +309,7 @@ public:
     if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { 
       std::cout <<"MdirAll out size "<< out.size()<<std::endl;
       std::cout <<"MdirAll ndir "<< ndir<<std::endl;
-      assert(0);
+      GRID_ASSERT(0);
     }
     for(int p=0;p<ndir;p++){
       MdirCalc(in,out[p],p);
@@ -373,7 +373,7 @@ public:
     conformable(in.Grid(), _cbgrid);    // verifies half grid
     conformable(in.Grid(), out.Grid()); // drops the cb check
 
-    assert(in.Checkerboard() == Even);
+    GRID_ASSERT(in.Checkerboard() == Even);
     out.Checkerboard() = Odd;
 
     DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -383,7 +383,7 @@ public:
     conformable(in.Grid(), _cbgrid);    // verifies half grid
     conformable(in.Grid(), out.Grid()); // drops the cb check
 
-    assert(in.Checkerboard() == Odd);
+    GRID_ASSERT(in.Checkerboard() == Odd);
     out.Checkerboard() = Even;
 
     DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -391,7 +391,7 @@ public:
 
   void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
     out.Checkerboard() = in.Checkerboard();
-    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+    GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
 
     CoarseMatrix *Aself = nullptr;
     if(in.Grid()->_isCheckerBoarded) {
@@ -406,7 +406,7 @@ public:
       Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
       DselfInternal(Stencil, *Aself, in, out, dag);
     }
-    assert(Aself != nullptr);
+    GRID_ASSERT(Aself != nullptr);
   }
 
   void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -697,7 +697,7 @@ public:
     evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
     oddmask  = one-evenmask;
 
-    assert(self_stencil!=-1);
+    GRID_ASSERT(self_stencil!=-1);
 
     for(int i=0;i<nbasis;i++){
 
diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
index 8e6302b2..3ba145db 100644
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -99,7 +99,7 @@ public:
 	}
       }
     }
-    assert(nfound==geom.npoint);
+    GRID_ASSERT(nfound==geom.npoint);
     ExchangeCoarseLinks();
   }
   */
@@ -124,7 +124,7 @@ public:
   }
   void Mdag (const CoarseVector &in, CoarseVector &out)
   {
-    assert(hermitian);
+    GRID_ASSERT(hermitian);
     Mult(_A,in,out);
     //    if ( hermitian ) M(in,out);
     //    else Mult(_Adag,in,out);
@@ -619,7 +619,7 @@ public:
       //      _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
     }
   }
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
+  virtual  void Mdiag    (const Field &in, Field &out){ GRID_ASSERT(0);};
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){GRID_ASSERT(0);};
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){GRID_ASSERT(0);};
 };
diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
index 98f4b22c..c5de17a4 100644
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -80,12 +80,12 @@ public:
   // Can be used to do I/O on the operator matrices externally
   void SetMatrix (int p,CoarseMatrix & A)
   {
-    assert(A.size()==geom_srhs.npoint);
+    GRID_ASSERT(A.size()==geom_srhs.npoint);
     GridtoBLAS(A[p],BLAS_A[p]);
   }
   void GetMatrix (int p,CoarseMatrix & A)
   {
-    assert(A.size()==geom_srhs.npoint);
+    GRID_ASSERT(A.size()==geom_srhs.npoint);
     BLAStoGrid(A[p],BLAS_A[p]);
   }
   void CopyMatrix (GeneralCoarseOp &_Op)
@@ -178,14 +178,14 @@ public:
 	for(int32_t point = 0 ; point < geom.npoint; point++){
 	  int i=s*orhs*geom.npoint+point;
  	  int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
-	  assert(nbr<BLAS_B.size());
+	  GRID_ASSERT(nbr<BLAS_B.size());
 	  ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
 	  acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
 	}
 	j++;
       }
     }
-    assert(j==unpadded_sites);
+    GRID_ASSERT(j==unpadded_sites);
   }
   template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
   {
@@ -194,7 +194,7 @@ public:
   typedef typename vobj::vector_type vector_type;
 
   GridBase *Fg = from.Grid();
-  assert(!Fg->_isCheckerBoarded);
+  GRID_ASSERT(!Fg->_isCheckerBoarded);
   int nd = Fg->_ndimension;
 
   to.resize(Fg->lSites());
@@ -241,10 +241,10 @@ public:
   typedef typename vobj::vector_type vector_type;
 
   GridBase *Tg = grid.Grid();
-  assert(!Tg->_isCheckerBoarded);
+  GRID_ASSERT(!Tg->_isCheckerBoarded);
   int nd = Tg->_ndimension;
   
-  assert(in.size()==Tg->lSites());
+  GRID_ASSERT(in.size()==Tg->lSites());
 
   Coordinate LocalLatt = Tg->LocalDimensions();
   size_t nsite = 1;
@@ -669,7 +669,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat    122213270 us
     const int Nsimd = CComplex::Nsimd();
 
     int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
-    assert(nrhs>=1);
+    GRID_ASSERT(nrhs>=1);
 
     RealD flops,bytes;
     int64_t osites=in.Grid()->oSites(); // unpadded
@@ -721,7 +721,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat    122213270 us
     //    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
     //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
   };
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
+  virtual  void Mdiag    (const Field &in, Field &out){ GRID_ASSERT(0);};
+  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){GRID_ASSERT(0);};
+  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){GRID_ASSERT(0);};
 };
diff --git a/Grid/algorithms/multigrid/Geometry.h b/Grid/algorithms/multigrid/Geometry.h
index e239484a..11942d94 100644
--- a/Grid/algorithms/multigrid/Geometry.h
+++ b/Grid/algorithms/multigrid/Geometry.h
@@ -67,8 +67,8 @@ public:
   }
 
   int point(int dir, int disp) {
-    assert(disp == -1 || disp == 0 || disp == 1);
-    assert(base+0 <= dir && dir < base+4);
+    GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);
+    GRID_ASSERT(base+0 <= dir && dir < base+4);
 
     // directions faster index = new indexing
     // 4d (base = 0):
@@ -131,7 +131,7 @@ public:
 	return p;
       }
     }
-    assert(0);
+    GRID_ASSERT(0);
     return -1;
   }
   void BuildShifts(void)
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 316f201c..648b6d90 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -57,7 +57,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
 
@@ -106,7 +106,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
 
@@ -154,7 +154,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
 
diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index 30a24c9c..008313b8 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -292,7 +292,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
 #ifdef GRID_OMP
-  assert(omp_in_parallel()==0);
+  GRID_ASSERT(omp_in_parallel()==0);
 #endif 
 
   if (ncache == 0) return ptr;
@@ -345,7 +345,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
 #ifdef GRID_OMP
-  assert(omp_in_parallel()==0);
+  GRID_ASSERT(omp_in_parallel()==0);
 #endif 
   for(int e=0;e<ncache;e++){
     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index 09afbcf7..0eda6317 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -50,12 +50,12 @@ int   MemoryManager::EntryPresent(uint64_t CpuPtr)
 {
   if(AccViewTable.empty()) return 0;
 
-  auto count = AccViewTable.count(CpuPtr);  assert((count==0)||(count==1));
+  auto count = AccViewTable.count(CpuPtr);  GRID_ASSERT((count==0)||(count==1));
   return count;
 }
 void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
-  assert(!EntryPresent(CpuPtr));
+  GRID_ASSERT(!EntryPresent(CpuPtr));
   AcceleratorViewEntry AccCache;
   AccCache.CpuPtr = CpuPtr;
   AccCache.AccPtr = (uint64_t)NULL;
@@ -69,9 +69,9 @@ void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
 }
 MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
 {
-  assert(EntryPresent(CpuPtr));
+  GRID_ASSERT(EntryPresent(CpuPtr));
   auto AccCacheIterator = AccViewTable.find(CpuPtr);
-  assert(AccCacheIterator!=AccViewTable.end());
+  GRID_ASSERT(AccCacheIterator!=AccViewTable.end());
   return AccCacheIterator;
 }
 void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -81,7 +81,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
 }
 void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.LRU_valid==0);
+  GRID_ASSERT(AccCache.LRU_valid==0);
   if (AccCache.transient) { 
     LRU.push_back(AccCache.CpuPtr);
     AccCache.LRU_entry = --LRU.end();
@@ -94,7 +94,7 @@ void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 }
 void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.LRU_valid==1);
+  GRID_ASSERT(AccCache.LRU_valid==1);
   LRU.erase(AccCache.LRU_entry);
   AccCache.LRU_valid = 0;
   DeviceLRUBytes-=AccCache.bytes;
@@ -108,12 +108,12 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   // Remove from Accelerator, remove entry, without flush
   // Cannot be locked. If allocated Must be in LRU pool.
   ///////////////////////////////////////////////////////////
-  assert(AccCache.state!=Empty);
+  GRID_ASSERT(AccCache.state!=Empty);
   
   dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
-  assert(AccCache.accLock==0);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr) {
     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
     DeviceDestroy++;
@@ -138,7 +138,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   //                          Take these OUT LRU queue when CPU locked?
   //                          Cannot take out the table as cpuLock data is important.
   ///////////////////////////////////////////////////////////////////////////
-  assert(AccCache.state!=Empty);
+  GRID_ASSERT(AccCache.state!=Empty);
   
   mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
@@ -162,11 +162,11 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state==AccDirty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.AccPtr!=(uint64_t)NULL);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state==AccDirty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
   mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
@@ -175,10 +175,10 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 }
 void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state==CpuDirty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state==CpuDirty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
@@ -194,10 +194,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 
 void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state!=Empty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state!=Empty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
@@ -216,7 +216,7 @@ void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     CpuViewClose((uint64_t)Ptr);
   } else { 
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
@@ -228,7 +228,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
   } else { 
-    assert(0);
+    GRID_ASSERT(0);
     return NULL;
   }
 }
@@ -237,10 +237,10 @@ void  MemoryManager::EvictVictims(uint64_t bytes)
   if(bytes>=DeviceMaxBytes) {
     printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
   }
-  assert(bytes<DeviceMaxBytes);
+  GRID_ASSERT(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
-      assert(LRU.size()>0);
+      GRID_ASSERT(LRU.size()>0);
       uint64_t victim = LRU.back(); // From the LRU
       auto AccCacheIterator = EntryLookup(victim);
       auto & AccCache = AccCacheIterator->second;
@@ -264,9 +264,9 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   if (!AccCache.AccPtr) {
     EvictVictims(bytes); 
   } 
-  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
+  GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
 
-  assert(AccCache.cpuLock==0);  // Programming error
+  GRID_ASSERT(AccCache.cpuLock==0);  // Programming error
 
   if(AccCache.state!=Empty) {
     dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
@@ -275,8 +275,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 		    (uint64_t)AccCache.bytes,
 	            (uint64_t)bytes,
 		    (uint64_t)AccCache.accLock);
-    assert(AccCache.CpuPtr == CpuPtr);
-    assert(AccCache.bytes  ==bytes);
+    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
+    GRID_ASSERT(AccCache.bytes  ==bytes);
   }
 /*
  *  State transitions and actions
@@ -293,7 +293,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  *  AccWrite AccDirty   AccDirty       -        - 
  */
   if(AccCache.state==Empty) {
-    assert(AccCache.LRU_valid==0);
+    GRID_ASSERT(AccCache.LRU_valid==0);
     AccCache.CpuPtr = CpuPtr;
     AccCache.AccPtr = (uint64_t)NULL;
     AccCache.bytes  = bytes;
@@ -338,10 +338,10 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
     AccCache.accLock++;
     dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
   } else {
-    assert(0);
+    GRID_ASSERT(0);
   }
 
-  assert(AccCache.accLock>0);
+  GRID_ASSERT(AccCache.accLock>0);
   // If view is opened on device must remove from LRU
   if(AccCache.LRU_valid==1){
     // must possibly remove from LRU as now locked on GPU
@@ -362,8 +362,8 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
 
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock>0);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock>0);
 
   AccCache.accLock--;
   // Move to LRU queue if not locked and close on device
@@ -379,8 +379,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
 
-  assert(AccCache.cpuLock>0);
-  assert(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.cpuLock>0);
+  GRID_ASSERT(AccCache.accLock==0);
 
   AccCache.cpuLock--;
 }
@@ -413,12 +413,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
   //    EvictVictims(bytes);
   //  }
 
-  assert((mode==CpuRead)||(mode==CpuWrite));
-  assert(AccCache.accLock==0);  // Programming error
+  GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite));
+  GRID_ASSERT(AccCache.accLock==0);  // Programming error
 
   if(AccCache.state!=Empty) {
-    assert(AccCache.CpuPtr == CpuPtr);
-    assert(AccCache.bytes==bytes);
+    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
+    GRID_ASSERT(AccCache.bytes==bytes);
   }
 
   if(AccCache.state==Empty) {
@@ -433,20 +433,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
     AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
     AccCache.cpuLock++;
   } else if(AccCache.state==Consistent) {
-    assert(AccCache.AccPtr != (uint64_t)NULL);
+    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
     if(mode==CpuWrite)
       AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
     else 
       AccCache.state = Consistent; // Consistent +CpuRead  => Consistent
     AccCache.cpuLock++;
   } else if(AccCache.state==AccDirty) {
-    assert(AccCache.AccPtr != (uint64_t)NULL);
+    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
     Flush(AccCache);
     if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
     else            AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
     AccCache.cpuLock++;
   } else {
-    assert(0); // should be unreachable
+    GRID_ASSERT(0); // should be unreachable
   }
 
   AccCache.transient= transient? EvictNext : 0;
@@ -528,12 +528,12 @@ void MemoryManager::Audit(std::string s)
   std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
   for(auto it=LRU.begin();it!=LRU.end();it++){
     uint64_t cpuPtr = *it;
-    assert(EntryPresent(cpuPtr));
+    GRID_ASSERT(EntryPresent(cpuPtr));
     auto AccCacheIterator = EntryLookup(cpuPtr);
     auto & AccCache = AccCacheIterator->second;
     LruBytes2+=AccCache.bytes;
-    assert(AccCache.LRU_valid==1);
-    assert(AccCache.LRU_entry==it);
+    GRID_ASSERT(AccCache.LRU_valid==1);
+    GRID_ASSERT(AccCache.LRU_entry==it);
   }
   std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
 
@@ -552,7 +552,7 @@ void MemoryManager::Audit(std::string s)
     if( AccCache.LRU_valid ) LruCnt++;
     
     if ( AccCache.cpuLock || AccCache.accLock ) {
-      assert(AccCache.LRU_valid==0);
+      GRID_ASSERT(AccCache.LRU_valid==0);
 
       std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -561,16 +561,16 @@ void MemoryManager::Audit(std::string s)
 		<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
     }
 
-    assert( AccCache.cpuLock== 0 ) ;
-    assert( AccCache.accLock== 0 ) ;
+    GRID_ASSERT( AccCache.cpuLock== 0 ) ;
+    GRID_ASSERT( AccCache.accLock== 0 ) ;
   }
   std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
-  assert(LruBytes1==LruBytes2);
-  assert(LruBytes1==DeviceLRUBytes);
+  GRID_ASSERT(LruBytes1==LruBytes2);
+  GRID_ASSERT(LruBytes1==DeviceLRUBytes);
   std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
-  assert(AccBytes==DeviceBytes);
+  GRID_ASSERT(AccBytes==DeviceBytes);
   std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
-  assert(LruCnt == LRU.size());
+  GRID_ASSERT(LruCnt == LRU.size());
   std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 
 }
diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc
index 37269785..2c46dfab 100644
--- a/Grid/allocator/MemoryStats.cc
+++ b/Grid/allocator/MemoryStats.cc
@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
   int fd = open("/proc/self/pagemap", O_RDONLY);
-  assert(fd >= 0);
+  GRID_ASSERT(fd >= 0);
   const int page_size = 4096;
   uint64_t virt_pfn = (uint64_t)Buf / page_size;
   off_t offset = sizeof(uint64_t) * virt_pfn;
   uint64_t npages = (BYTES + page_size-1) / page_size;
   std::vector<uint64_t> pagedata(npages);
   uint64_t ret = lseek(fd, offset, SEEK_SET);
-  assert(ret == offset);
+  GRID_ASSERT(ret == offset);
   ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
-  assert(ret == sizeof(uint64_t) * npages);
+  GRID_ASSERT(ret == sizeof(uint64_t) * npages);
   int nhugepages = npages / 512;
   int n4ktotal, nnothuge;
   n4ktotal = 0;
diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h
index 66400787..1a5049ca 100644
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -165,7 +165,7 @@ public:
     //
     if ( _simd_layout[dimension] > 2 ) { 
       for(int d=0;d<_ndimension;d++){
-	if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+	if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1)  );
       }
       permute_type = RotateBit; // How to specify distance; this is not just direction.
       return permute_type;
@@ -187,7 +187,7 @@ public:
   inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; 
   inline int Nd    (void) const { return _ndimension;};
 
-  inline const Coordinate LocalStarts(void)             { return _lstart;    };
+  inline const Coordinate &LocalStarts(void)            { return _lstart;    };
   inline const Coordinate &FullDimensions(void)         { return _fdimensions;};
   inline const Coordinate &GlobalDimensions(void)       { return _gdimensions;};
   inline const Coordinate &LocalDimensions(void)        { return _ldimensions;};
@@ -216,11 +216,11 @@ public:
   // Global addressing
   ////////////////////////////////////////////////////////////////
   void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
-    assert(gidx< gSites());
+    GRID_ASSERT(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
   void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
-    assert(lidx<lSites());
+    GRID_ASSERT(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
   void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
diff --git a/Grid/cartesian/Cartesian_full.h b/Grid/cartesian/Cartesian_full.h
index 1d883dc7..9a20982d 100644
--- a/Grid/cartesian/Cartesian_full.h
+++ b/Grid/cartesian/Cartesian_full.h
@@ -128,10 +128,10 @@ public:
         // Use a reduced simd grid
         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
         //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
-        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+        GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
 
         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
-        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
 
         _lstart[d] = _processor_coor[d] * _ldimensions[d];
         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h
index 63c8fb4a..f218c2b5 100644
--- a/Grid/cartesian/Cartesian_red_black.h
+++ b/Grid/cartesian/Cartesian_red_black.h
@@ -67,7 +67,7 @@ public:
   }
   virtual int CheckerBoard(const Coordinate &site){
     int linear=0;
-    assert(site.size()==_ndimension);
+    GRID_ASSERT(site.size()==_ndimension);
     for(int d=0;d<_ndimension;d++){ 
       if(_checker_dim_mask[d])
 	linear=linear+site[d];
@@ -160,11 +160,11 @@ public:
 
       _isCheckerBoarded = true;
     _checker_dim = checker_dim;
-    assert(checker_dim_mask[checker_dim] == 1);
+    GRID_ASSERT(checker_dim_mask[checker_dim] == 1);
     _ndimension = dimensions.size();
-    assert(checker_dim_mask.size() == _ndimension);
-    assert(processor_grid.size() == _ndimension);
-    assert(simd_layout.size() == _ndimension);
+    GRID_ASSERT(checker_dim_mask.size() == _ndimension);
+    GRID_ASSERT(processor_grid.size() == _ndimension);
+    GRID_ASSERT(simd_layout.size() == _ndimension);
 
     _fdimensions.resize(_ndimension);
     _gdimensions.resize(_ndimension);
@@ -190,20 +190,20 @@ public:
 
         if (d == _checker_dim)
 	  {
-	    assert((_gdimensions[d] & 0x1) == 0);
+	    GRID_ASSERT((_gdimensions[d] & 0x1) == 0);
 	    _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
 	    _gsites /= 2;
 	  }
         _ldimensions[d] = _gdimensions[d] / _processors[d];
-        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+        GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
         _lstart[d] = _processor_coor[d] * _ldimensions[d];
         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
 
         // Use a reduced simd grid
         _simd_layout[d] = simd_layout[d];
         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
-        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
-        assert(_rdimensions[d] > 0);
+        GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        GRID_ASSERT(_rdimensions[d] > 0);
 
         // all elements of a simd vector must have same checkerboard.
         // If Ls vectorised, this must still be the case; e.g. dwf rb5d
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index 8fd8ec34..deb93fae 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -108,7 +108,7 @@ public:
   // very VERY rarely (Log, serial RNG) we need world without a grid
   ////////////////////////////////////////////////////////////////////////////////
   static int  RankWorld(void) ;
-  static void BroadcastWorld(int root,void* data, int bytes);
+  static void BroadcastWorld(int root,void* data, uint64_t bytes);
   static void BarrierWorld(void);
   
   ////////////////////////////////////////////////////////////
@@ -149,7 +149,7 @@ public:
 			    sizeof(obj),d*100+p);
 
       }
-      if (!list.empty()) // avoid triggering assert in comms == none
+      if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
 	CommsComplete(list);
       for(int p=1;p<_processors[d];p++){
 	accum = accum + column[p];
@@ -175,27 +175,27 @@ public:
 			   int dest,
 			   void *recv,
 			   int from,
-			   int bytes,int dir);
+			   uint64_t bytes,int dir);
   
   void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
 		      int recv_from_rank,
-		      int bytes);
+		      uint64_t bytes);
   
   int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
 			       int recv_from_rank,int do_recv,
-			       int bytes,int dir);
+			       uint64_t bytes,int dir);
 
   double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
 				      void *xmit,
 				      int xmit_to_rank,int do_xmit,
 				      void *recv,
 				      int recv_from_rank,int do_recv,
-				      int xbytes,int rbytes,int dir);
+				      uint64_t xbytes,uint64_t rbytes,int dir);
 
   // Could do a PollHtoD and have a CommsMerge dependence
   void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
@@ -206,7 +206,7 @@ public:
 				    int xmit_to_rank,int do_xmit,
 				    void *recv,void *recv_comp,
 				    int recv_from_rank,int do_recv,
-				    int xbytes,int rbytes,int dir);
+				    uint64_t xbytes,uint64_t rbytes,int dir);
   
   
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -220,20 +220,20 @@ public:
   ////////////////////////////////////////////////////////////
   // Broadcast a buffer and composite larger
   ////////////////////////////////////////////////////////////
-  void Broadcast(int root,void* data, int bytes);
+  void Broadcast(int root,void* data, uint64_t bytes);
 
   ////////////////////////////////////////////////////////////
   // All2All down one dimension
   ////////////////////////////////////////////////////////////
   template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
-    assert(dim>=0);
-    assert(dim<_ndimension);
-    assert(in.size()==out.size());
+    GRID_ASSERT(dim>=0);
+    GRID_ASSERT(dim<_ndimension);
+    GRID_ASSERT(in.size()==out.size());
     int numnode = _processors[dim];
     uint64_t bytes=sizeof(T);
     uint64_t words=in.size()/numnode;
-    assert(numnode * words == in.size());
-    assert(words < (1ULL<<31));
+    GRID_ASSERT(numnode * words == in.size());
+    GRID_ASSERT(words < (1ULL<<31));
     AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
   }
   void AllToAll(int dim  ,void *in,void *out,uint64_t words,uint64_t bytes);
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 0f03aba5..3b8561d3 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -28,15 +28,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/communicator/SharedMemory.h>
 
+void GridAbort(void) { MPI_Abort(MPI_COMM_WORLD,SIGABRT); }
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
 NAMESPACE_BEGIN(Grid);
 
 
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 #ifdef GRID_CHECKSUM_COMMS
-extern void * Grid_backtrace_buffer[_NBACKTRACE];
 uint64_t checksum_index = 1;
 #endif
 
+
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
@@ -60,11 +63,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
 #endif
     //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
     if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
-      assert(0);
+      GRID_ASSERT(0);
     }
 
     if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
-      assert(0);
+      GRID_ASSERT(0);
     }
   }
 
@@ -85,20 +88,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
 {
   int rank;
   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
 {
   coor.resize(_ndimension);
   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -125,8 +128,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 //////////////////////////////////
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
 {
-  _ndimension = processors.size();  assert(_ndimension>=1);
-  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  _ndimension = processors.size();  GRID_ASSERT(_ndimension>=1);
+  int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension);
   Coordinate parent_processor_coor(_ndimension,0);
   Coordinate parent_processors    (_ndimension,1);
   Coordinate shm_processors       (_ndimension,1);
@@ -150,7 +153,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     childsize *= processors[d];
   }
   int Nchild = Nparent/childsize;
-  assert (childsize * Nchild == Nparent);
+  GRID_ASSERT (childsize * Nchild == Nparent);
 
   Coordinate ccoor(_ndimension); // coor within subcommunicator
   Coordinate scoor(_ndimension); // coor of split within parent
@@ -176,12 +179,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     // Split the communicator
     ////////////////////////////////////////////////////////////////
     int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
-    assert(ierr==0);
+    GRID_ASSERT(ierr==0);
 
   } else {
     srank = 0;
     int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
-    assert(ierr==0);
+    GRID_ASSERT(ierr==0);
   }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -206,7 +209,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     }
   }
   for(int d=0;d<processors.size();d++){
-    assert(_processor_coor[d] == ccoor[d] );
+    GRID_ASSERT(_processor_coor[d] == ccoor[d] );
   }
 }
 
@@ -248,7 +251,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
   for(int i=0;i<_ndimension*2;i++){
     MPI_Comm_dup(communicator,&communicator_halo[i]);
   }
-  assert(Size==_Nprocessors);
+  GRID_ASSERT(Size==_Nprocessors);
 }
 
 CartesianCommunicator::~CartesianCommunicator()
@@ -276,62 +279,62 @@ void CartesianCommunicator::GlobalSum(double &d)
 void CartesianCommunicator::GlobalSum(float &f){
   FlightRecorder::StepLog("AllReduce float");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
   FlightRecorder::StepLog("AllReduce double");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   FlightRecorder::StepLog("AllReduce uint32_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
   FlightRecorder::StepLog("AllReduce uint64_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
   FlightRecorder::StepLog("AllReduceVector");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
   FlightRecorder::StepLog("GlobalXOR");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
   FlightRecorder::StepLog("GlobalMax");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
   FlightRecorder::StepLog("GlobalMax");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   FlightRecorder::StepLog("GlobalSumVector(float *)");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   FlightRecorder::StepLog("GlobalSumVector(double *)");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
@@ -339,24 +342,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
 						int dest,
 						void *recv,
 						int from,
-						int bytes,int dir)
+						uint64_t bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
 
-  assert(dest != _processor);
-  assert(from != _processor);
-
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
   int tag;
 
   tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
-  assert(ierr==0);
+  int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
+  GRID_ASSERT(ierr==0);
   list.push_back(rrq);
   
   tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
-  assert(ierr==0);
+  ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
+  GRID_ASSERT(ierr==0);
   list.push_back(xrq);
 }
 void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
@@ -367,7 +369,7 @@ void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   list.resize(0);
 }
 
@@ -376,7 +378,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
-					   int bytes)
+					   uint64_t bytes)
 {
   std::vector<MpiCommsRequest_t> reqs(0);
 
@@ -384,15 +386,15 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
   int ierr;
 
   // Enforce no UVM in comms, device or host OK
-  assert(acceleratorIsCommunicable(xmit));
-  assert(acceleratorIsCommunicable(recv));
+  GRID_ASSERT(acceleratorIsCommunicable(xmit));
+  GRID_ASSERT(acceleratorIsCommunicable(recv));
 
   // Give the CPU to MPI immediately; can use threads to overlap optionally
   //  printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
-  ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
-		    recv,bytes,MPI_CHAR,from, from,
+  ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
+		    recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
 		    communicator,MPI_STATUS_IGNORE);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 
 }
 // Basic Halo comms primitive
@@ -400,7 +402,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int dest, int dox,
 						     void *recv,
 						     int from, int dor,
-						     int bytes,int dir)
+						     uint64_t bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
@@ -423,7 +425,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 							   int dest,int dox,
 							   void *recv,
 							   int from,int dor,
-							   int xbytes,int rbytes,int dir)
+							   uint64_t xbytes,uint64_t rbytes,int dir)
 {
   return 0.0; // Do nothing -- no preparation required
 }
@@ -432,7 +434,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int dest,int dox,
 							 void *recv,void *recv_comp,
 							 int from,int dor,
-							 int xbytes,int rbytes,int dir)
+							 uint64_t xbytes,uint64_t rbytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -445,9 +447,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
 
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme  == ShmRank);
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme  == ShmRank);
   double off_node_bytes=0.0;
   int tag;
   
@@ -455,15 +457,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
       //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
-      ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
+      ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
+      GRID_ASSERT(ierr==0);
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
 #ifdef NVLINK_GET
     else { 
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
       //      std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
     }
@@ -473,14 +475,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
+      ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
+      GRID_ASSERT(ierr==0);
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
 #ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
     }
@@ -497,7 +499,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   if (nreq==0) return;
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   list.resize(0);
   this->StencilBarrier(); 
 }
@@ -540,7 +542,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 							   int dest,int dox,
 							   void *recv,
 							   int from,int dor,
-							   int xbytes,int rbytes,int dir)
+							   uint64_t xbytes,uint64_t rbytes,int dir)
 {
 /*
  * Bring sequence from Stencil.h down to lower level.
@@ -557,9 +559,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
 
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme  == ShmRank);
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme  == ShmRank);
   double off_node_bytes=0.0;
   int tag;
 
@@ -581,8 +583,8 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
       host_recv = this->HostBufferMalloc(rbytes);
-      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
+      ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
+      GRID_ASSERT(ierr==0);
       CommsRequest_t srq;
       srq.PacketType = InterNodeRecv;
       srq.bytes      = rbytes;
@@ -606,7 +608,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 #ifdef GRID_CHECKSUM_COMMS
       uint64_t xbytes_data = xbytes - 8;
       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
-      assert(xbytes % 8 == 0);
+      GRID_ASSERT(xbytes % 8 == 0);
       // flip one bit so that a zero buffer is not consistent
       uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag); 
       *(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
@@ -615,7 +617,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 #endif
       
       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      //      assert(ierr==0);
+      //      GRID_ASSERT(ierr==0);
       //      off_node_bytes+=xbytes;
 
       srq.PacketType = InterNodeXmit;
@@ -683,7 +685,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 	if ( acceleratorEventIsComplete(list[idx].ev) ) {
 
 	  void *host_xmit = list[idx].host_buf;
-	  uint32_t xbytes = list[idx].bytes;
+	  uint64_t xbytes = list[idx].bytes;
 	  int dest        = list[idx].dest;
 	  int tag         = list[idx].tag;
 	  int commdir     = list[idx].commdir;
@@ -694,8 +696,8 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
 	  
 	  MPI_Request xrq;
-	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-	  assert(ierr==0);
+	  int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
+	  GRID_ASSERT(ierr==0);
 
 	  list[idx].req        = xrq; // Update the MPI request in the list
 
@@ -715,7 +717,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 							 int dest,int dox,
 							 void *recv,void *recv_comp,
 							 int from,int dor,
-							 int xbytes,int rbytes,int dir)
+							 uint64_t xbytes,uint64_t rbytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -728,9 +730,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
 
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme  == ShmRank);
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme  == ShmRank);
   double off_node_bytes=0.0;
   int tag;
 
@@ -751,7 +753,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
       // Intranode
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
 
       CommsRequest_t srq;
 
@@ -774,7 +776,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
       // Intranode
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
 
       CommsRequest_t srq;
       
@@ -813,7 +815,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   if (nreq>0) {
     status.resize(MpiRequests.size());
     int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-    assert(ierr==0);
+    GRID_ASSERT(ierr==0);
   }
   
   //  for(int r=0;r<nreq;r++){
@@ -879,17 +881,19 @@ void CartesianCommunicator::Barrier(void)
 {
   FlightRecorder::StepLog("GridBarrier");
   int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
 {
   FlightRecorder::StepLog("Broadcast");
+  // MPI_Bcast takes an int count: refuse sizes that the (int) cast below would truncate
+  GRID_ASSERT(bytes == (uint64_t)(int)bytes);
   int ierr=MPI_Bcast(data,
-		     bytes,
+		     (int)bytes,
 		     MPI_BYTE,
 		     root,
 		     communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 int CartesianCommunicator::RankWorld(void){
   int r;
@@ -899,23 +901,25 @@ int CartesianCommunicator::RankWorld(void){
 void CartesianCommunicator::BarrierWorld(void){
   FlightRecorder::StepLog("BarrierWorld");
   int ierr = MPI_Barrier(communicator_world);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
 {
   FlightRecorder::StepLog("BroadcastWorld");
+  // MPI_Bcast takes an int count: refuse sizes that the (int) cast below would truncate
+  GRID_ASSERT(bytes == (uint64_t)(int)bytes);
   int ierr= MPI_Bcast(data,
-		      bytes,
+		      (int)bytes,
 		      MPI_BYTE,
 		      root,
 		      communicator_world);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
   Coordinate row(_ndimension,1);
-  assert(dim>=0 && dim<_ndimension);
+  GRID_ASSERT(dim>=0 && dim<_ndimension);
 
   //  Split the communicator
   row[dim] = _processors[dim];
@@ -936,8 +938,8 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
   int ibytes;
   iwords = words;
   ibytes = bytes;
-  assert(words == iwords); // safe to cast to int ?
-  assert(bytes == ibytes); // safe to cast to int ?
+  GRID_ASSERT(words == iwords); // safe to cast to int ?
+  GRID_ASSERT(bytes == ibytes); // safe to cast to int ?
   MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
   MPI_Type_commit(&object);
   MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 350b967c..a7232fcd 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -27,6 +27,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 
+void GridAbort(void) { abort(); }
+
 NAMESPACE_BEGIN(Grid);
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -34,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 
+
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
   GlobalSharedMemory::Init(communicator_world);
@@ -54,14 +57,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
   _shm_processors = Coordinate(processors.size(),1);
   _processors = processors;
-  _ndimension = processors.size();  assert(_ndimension>=1);
+  _ndimension = processors.size();  GRID_ASSERT(_ndimension>=1);
   _processor_coor.resize(_ndimension);
   
   // Require 1^N processor grid for fake
   _Nprocessors=1;
   _processor = 0;
   for(int d=0;d<_ndimension;d++) {
-    assert(_processors[d]==1);
+    GRID_ASSERT(_processors[d]==1);
     _processor_coor[d] = 0;
   }
   SetCommunicator(communicator_world);
@@ -87,19 +90,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
-					   int bytes)
+					   uint64_t bytes)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
-						int bytes,int dir)
+						uint64_t bytes,int dir)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
@@ -113,8 +116,8 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
 
 int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
 void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
@@ -130,7 +133,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,
 						     int recv_from_rank,int dor,
-						     int bytes, int dir)
+						     uint64_t bytes, int dir)
 {
   return 2.0*bytes;
 }
@@ -141,16 +144,16 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 							   int xmit_to_rank,int dox,
 							   void *recv,
 							   int recv_from_rank,int dor,
-							   int xbytes,int rbytes, int dir)
+							   uint64_t xbytes,uint64_t rbytes, int dir)
 {
   return 0.0;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
+							 void *xmit, void *xmit_comp,
 							 int xmit_to_rank,int dox,
-							 void *recv,
+							 void *recv, void *recv_comp,
 							 int recv_from_rank,int dor,
-							 int xbytes,int rbytes, int dir)
+							 uint64_t xbytes,uint64_t rbytes, int dir)
 {
   return xbytes+rbytes;
 }
diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc
index 3445b077..662f319f 100644
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -58,8 +58,8 @@ int                 GlobalSharedMemory::WorldNode;
 
 void GlobalSharedMemory::SharedMemoryFree(void)
 {
-  assert(_ShmAlloc);
-  assert(_ShmAllocBytes>0);
+  GRID_ASSERT(_ShmAlloc);
+  GRID_ASSERT(_ShmAllocBytes>0);
   for(int r=0;r<WorldShmSize;r++){
     munmap(WorldShmCommBufs[r],_ShmAllocBytes);
   }
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
-    assert(host_heap_bytes<host_heap_size);
+    GRID_ASSERT(host_heap_bytes<host_heap_size);
   }
   return ptr;
 }
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
-    assert(heap_bytes<heap_size);
+    GRID_ASSERT(heap_bytes<heap_size);
   }
   //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
   return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
   if ( str ) {
     std::vector<int> IntShmDims;
     GridCmdOptionIntVector(std::string(str),IntShmDims);
-    assert(IntShmDims.size() == WorldDims.size());
+    GRID_ASSERT(IntShmDims.size() == WorldDims.size());
     long ShmSize = 1;
     for (int dim=0;dim<WorldDims.size();dim++) {
       ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
-      assert(divides(ShmDims[dim],WorldDims[dim]));
+      GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim]));
     }
-    assert(ShmSize == WorldShmSize);
+    GRID_ASSERT(ShmSize == WorldShmSize);
     return;
   }
   
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 6ef950d5..213543cc 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -67,7 +67,7 @@ public:
   {
     int errnum;
 
-    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0);
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  GRID_ASSERT(sock>0);
 
     struct sockaddr_un sa_un = { 0 };
     sa_un.sun_family = AF_UNIX;
@@ -158,7 +158,7 @@ public:
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
-  assert(_ShmSetup==0);
+  GRID_ASSERT(_ShmSetup==0);
   WorldComm = comm;
   MPI_Comm_rank(WorldComm,&WorldRank);
   MPI_Comm_size(WorldComm,&WorldSize);
@@ -184,7 +184,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 
   // WorldNodes
   WorldNodes = WorldSize/WorldShmSize;
-  assert( (WorldNodes * WorldShmSize) == WorldSize );
+  GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize );
 
 
   // FIXME: Check all WorldShmSize are the same ?
@@ -209,7 +209,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   MyGroup.resize(WorldShmSize);
   for(int rank=0;rank<WorldSize;rank++){
     if(WorldShmRanks[rank]!=MPI_UNDEFINED){
-      assert(g<WorldShmSize);
+      GRID_ASSERT(g<WorldShmSize);
       MyGroup[g++] = rank;
     }
   }
@@ -225,7 +225,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   // global sum leaders over comm world
   ///////////////////////////////////////////////////////////////////
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 
   ///////////////////////////////////////////////////////////////////
   // find the group leaders world rank
@@ -246,7 +246,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
       WorldNode=g;
     }
   }
-  assert(WorldNode!=-1);
+  GRID_ASSERT(WorldNode!=-1);
   _ShmSetup=1;
 }
 // Gray encode support 
@@ -288,7 +288,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // Assert power of two shm_size.
   ////////////////////////////////////////////////////////////////
   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
+  GRID_ASSERT(log2size != -1);
 
   ////////////////////////////////////////////////////////////////
   // Identify the hypercube coordinate of this node using hostname
@@ -309,7 +309,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // Parse ICE-XA hostname to get hypercube location
   gethostname(name,namelen);
   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  assert(nscan==3);
+  GRID_ASSERT(nscan==3);
 
   int nlo = N%9;
   int nhi = N/9;
@@ -333,8 +333,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   //////////////////////////////////////////////////////////////////
   MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
   hypercoor=hypercoor-rootcoor;
-  assert(hypercoor<WorldSize);
-  assert(hypercoor>=0);
+  GRID_ASSERT(hypercoor<WorldSize);
+  GRID_ASSERT(hypercoor>=0);
 
   //////////////////////////////////////
   // Printing
@@ -382,7 +382,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   for(int i=0;i<ndimension;i++){
     Nprocessors*=processors[i];
   }
-  assert(WorldSize==Nprocessors);
+  GRID_ASSERT(WorldSize==Nprocessors);
 
   ////////////////////////////////////////////////////////////////
   // Establish mapping between lexico physics coord and WorldRank
@@ -401,7 +401,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // Build the new communicator
   /////////////////////////////////////////////////////////////////
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
@@ -431,7 +431,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
   for(int i=0;i<ndimension;i++){
     Nprocessors*=processors[i];
   }
-  assert(WorldSize==Nprocessors);
+  //  std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl; 
+  GRID_ASSERT(WorldSize==Nprocessors);
 
   ////////////////////////////////////////////////////////////////
   // Establish mapping between lexico physics coord and WorldRank
@@ -447,7 +448,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
   // Build the new communicator
   /////////////////////////////////////////////////////////////////
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
@@ -456,8 +457,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
 
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
@@ -518,8 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ; 
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
 
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the pointer array for shared windows for our group
@@ -628,7 +629,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 			 MPI_BYTE,
 			 r,
 			 WorldShmComm);
-      assert(ierr==0);
+      GRID_ASSERT(ierr==0);
     }
     
     ///////////////////////////////////////////////////////////////
@@ -667,7 +668,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
       }
-      assert(thisBuf!=nullptr);
+      GRID_ASSERT(thisBuf!=nullptr);
     }
 #endif
 #ifdef GRID_CUDA
@@ -708,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -739,9 +740,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
     if ( ptr == (void *)MAP_FAILED ) {    
       printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);    
+      perror("failed mmap");      GRID_ASSERT(0);    
     }
-    assert(((uint64_t)ptr&0x3F)==0);
+    GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -756,8 +757,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -768,7 +769,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // Hugetlbf and others map filesystems as mappable huge pages
   ////////////////////////////////////////////////////////////////////////////////////////////
   char shm_name [NAME_MAX];
-  assert(WorldShmSize == 1);
+  GRID_ASSERT(WorldShmSize == 1);
   for(int r=0;r<WorldShmSize;r++){
     
     int fd=-1;
@@ -782,9 +783,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
     if ( ptr == (void *)MAP_FAILED ) {    
       printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);    
+      perror("failed mmap");      GRID_ASSERT(0);    
     }
-    assert(((uint64_t)ptr&0x3F)==0);
+    GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
     //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -803,8 +804,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0); 
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0); 
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
 
@@ -835,7 +836,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	perror("failed mmap");     
 	assert(0);    
       }
-      assert(((uint64_t)ptr&0x3F)==0);
+      GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
       
       WorldShmCommBufs[r] =ptr;
       close(fd);
@@ -856,8 +857,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
       
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
-      assert(((uint64_t)ptr&0x3F)==0);
+      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      GRID_ASSERT(0);    }
+      GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
       WorldShmCommBufs[r] =ptr;
 
       close(fd);
@@ -914,7 +915,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   //////////////////////////////////////////////////////////////////////
   // Map ShmRank to WorldShmRank and use the right buffer
   //////////////////////////////////////////////////////////////////////
-  assert (GlobalSharedMemory::ShmAlloc()==1);
+  GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1);
   heap_size = GlobalSharedMemory::ShmAllocBytes();
   for(int r=0;r<ShmSize;r++){
 
@@ -982,9 +983,9 @@ void SharedMemory::SharedMemoryTest(void)
   ShmBarrier();
   for(uint64_t r=0;r<ShmSize;r++){
     acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
-    assert(check[0]==GlobalSharedMemory::WorldNode);
-    assert(check[1]==r);
-    assert(check[2]==magic);
+    GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
+    GRID_ASSERT(check[1]==r);
+    GRID_ASSERT(check[2]==magic);
   }
   ShmBarrier();
   std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
@@ -1002,7 +1003,7 @@ void *SharedMemory::ShmBuffer(int rank)
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   int gpeer = ShmRanks[rank];
-  assert(gpeer!=ShmRank); // never send to self
+  GRID_ASSERT(gpeer!=ShmRank); // never send to self
   //  std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
   if (gpeer == MPI_UNDEFINED){
     return NULL;
diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc
index dc8274f3..8ca7aeda 100644
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
-  assert(_ShmSetup==0);
+  GRID_ASSERT(_ShmSetup==0);
   WorldComm = 0;
   WorldRank = 0;
   WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
   void * ShmCommBuf ; 
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
 
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ; 
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   int mmap_flag =0;
 #ifdef MAP_ANONYMOUS
   mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -132,7 +132,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 ////////////////////////////////////////////////////////
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
-  assert(GlobalSharedMemory::ShmAlloc()==1);
+  GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1);
   ShmRanks.resize(1);
   ShmCommBufs.resize(1);
   ShmRanks[0] = 0;
diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
index b8099c27..fa3f27a5 100644
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -240,9 +240,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 
     // Case of SIMD split AND checker dim cannot currently be hit, except in 
     // Test_cshift_red_black code.
-    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
-    assert(0); // This will fail if hit on GPU
+    GRID_ASSERT(0); // This will fail if hit on GPU
     autoView( rhs_v, rhs, CpuWrite);
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index a66a0420..252478ad 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -49,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   // Map to always positive shift modulo global full dimension.
   shift = (shift+fd)%fd;
 
+  if( shift ==0 ) {
+    ret = rhs;
+    return ret;
+  }
+  //
+  // Potential easy fast cases:
+  // Shift is a multiple of the local lattice extent.
+  // Then need only to shift whole subvolumes
+  int L = rhs.Grid()->_ldimensions[dimension];
+  if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
+    Cshift_simple(ret,rhs,dimension,shift);
+    return ret;
+  }
+  
   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
         
   // the permute type
@@ -73,6 +87,55 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   return ret;
 }
 
+// Whole-subvolume circular shift along `dimension`.
+// Intended for the case Cshift checks before calling: `shift` is a multiple of the
+// local extent and the dimension is not checkerboarded, so each rank sends its whole
+// local field to one neighbour and receives a whole local field from another.
+// NOTE(review): if the dimension is not split across ranks, `ret` is left unwritten.
+template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  GridBase *grid = rhs.Grid();
+
+  const int ld       = grid->_ldimensions[dimension]; // local extent on this rank
+  const int pd       = grid->_processors[dimension];  // ranks along this dimension
+  const int comm_dim = pd > 1;                        // is the dimension distributed?
+
+  // Hop count in units of whole local volumes, wrapped modulo the processor count
+  const int comm_proc = (shift/ld)%pd;
+
+  int xmit_to_rank, recv_from_rank;
+  grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+  if(comm_dim) {
+    // Unsigned 64-bit count, the type SendToRecvFrom takes after the size widening
+    uint64_t bytes = sizeof(vobj) * grid->oSites();
+
+    // The views give raw pointers to the first site of each field's storage
+    autoView(rhs_v , rhs, AcceleratorRead);
+    autoView(ret_v , ret, AcceleratorWrite);
+    void *send_buf  = (void *)&rhs_v[0];
+    void *recv_buf  = (void *)&ret_v[0];
+
+#ifdef ACCELERATOR_AWARE_MPI
+    // Accelerator-aware MPI: pass the view pointers straight to the exchange
+    grid->SendToRecvFrom(send_buf,xmit_to_rank,recv_buf,recv_from_rank,bytes);
+#else
+    // Otherwise stage through host vectors; static so they persist between calls
+    static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
+    static hostVector<vobj> hret; hret.resize(grid->oSites());
+
+    void *hsend_buf = (void *)&hrhs[0];
+    void *hrecv_buf = (void *)&hret[0];
+
+    // Copy out of the device, exchange on the host, copy the result back in
+    acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);
+
+    grid->SendToRecvFrom(hsend_buf,xmit_to_rank,hrecv_buf,recv_from_rank,bytes);
+
+    acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
+#endif
+  }
+}
 template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
   int sshift[2];
@@ -121,10 +184,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   int pd              = rhs.Grid()->_processors[dimension];
   int simd_layout     = rhs.Grid()->_simd_layout[dimension];
   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
-  assert(simd_layout==1);
-  assert(comm_dim==1);
-  assert(shift>=0);
-  assert(shift<fd);
+  GRID_ASSERT(simd_layout==1);
+  GRID_ASSERT(comm_dim==1);
+  GRID_ASSERT(shift>=0);
+  GRID_ASSERT(shift<fd);
   
   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
@@ -187,7 +250,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
       acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
 
 #ifdef GRID_CHECKSUM_COMMS
-      assert(bytes % 8 == 0);
+      GRID_ASSERT(bytes % 8 == 0);
       checksum_index++;
       uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
       *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
@@ -213,7 +276,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 		<<" send "<<xsum<<" to   "<<xmit_to_rank
 		<<" recv "<<computed_cs<<" from "<<recv_from_rank
 		<<std::endl;
-      assert(expected_cs == computed_cs);
+      GRID_ASSERT(expected_cs == computed_cs);
 #else
       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
@@ -259,10 +322,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
   //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
 
-  assert(comm_dim==1);
-  assert(simd_layout==2);
-  assert(shift>=0);
-  assert(shift<fd);
+  GRID_ASSERT(comm_dim==1);
+  GRID_ASSERT(simd_layout==2);
+  GRID_ASSERT(shift>=0);
+  GRID_ASSERT(shift<fd);
 
   RealD tcopy=0.0;
   RealD tgather=0.0;
@@ -341,7 +404,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 
       if (nbr_ic) nbr_lane|=inner_bit;
 
-      assert (sx == nbr_ox);
+      GRID_ASSERT (sx == nbr_ox);
 
       if(nbr_proc){
 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index 2ad93d30..0bcad224 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
 inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
   if ((cb == Odd) || (cb == Even)) {
-    assert(cb == lat.Checkerboard());
+    GRID_ASSERT(cb == lat.Checkerboard()); // a cb already fixed to Odd/Even must match this leaf's checkerboard
   }
   cb = lat.Checkerboard();
 }
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index d8b9a618..7044f4a6 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -120,12 +120,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     
     auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
     auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
   Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
     resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
   Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
     resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
   Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
 
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
 
     resize(this->_grid->oSites());
@@ -264,7 +264,7 @@ public:
   Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
     this->_grid = grid;
     resize(this->_grid->oSites());
-    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
+    GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0); // low four address bits clear: _odata[0] sits on a 16-byte boundary
     this->checkerboard=0;
     SetViewMode(mode);
   }
diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h
index c9c65928..0d556bc0 100644
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -166,9 +166,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
 {
   int vlen = idx.size();
 
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
+  GRID_ASSERT(vlen>=1);
+  GRID_ASSERT(vlen<=sort_vals.size());
+  GRID_ASSERT(vlen<=_v.size());
 
   for (size_t i=0;i<vlen;i++) {
 
@@ -186,7 +186,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
 	if (idx[j]==i)
 	  break;
 
-      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
+      GRID_ASSERT(idx[i] > i);     GRID_ASSERT(j!=idx.size());      GRID_ASSERT(idx[j]==i);
 
       swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
       std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -224,7 +224,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
 template<class Field>
 void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
   result = Zero();
-  assert(_v.size()==eval.size());
+  GRID_ASSERT(_v.size()==eval.size());
   int N = (int)_v.size();
   for (int i=0;i<N;i++) {
     Field& tmp = _v[i];
diff --git a/Grid/lattice/Lattice_conformable.h b/Grid/lattice/Lattice_conformable.h
index ce22685e..0dddf445 100644
--- a/Grid/lattice/Lattice_conformable.h
+++ b/Grid/lattice/Lattice_conformable.h
@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
 
 template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
 {
-  assert(lhs.Grid() == rhs.Grid());
-  assert(lhs.Checkerboard() == rhs.Checkerboard());
+  GRID_ASSERT(lhs.Grid() == rhs.Grid()); // both operands must report the same Grid() value
+  GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard()); // and carry the same checkerboard value
 }
 
 NAMESPACE_END(Grid);
diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h
index abebbfd6..cb3600fe 100644
--- a/Grid/lattice/Lattice_matrix_reduction.h
+++ b/Grid/lattice/Lattice_matrix_reduction.h
@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   //  Lattice<vobj> Xslice(SliceGrid);
   //  Lattice<vobj> Rslice(SliceGrid);
 
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
 
   GridBase *FullGrid  = X.Grid();
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
   
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
   //  int nh =  FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
   //  int nl = nh-1;
diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h
index 6106962c..66fd6bea 100644
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 
   int Nsimd = grid->Nsimd();
 
-  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
 
   int rank,odx,idx;
   // Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 
   int Nsimd = grid->Nsimd();
 
-  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
+  GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site));
 
   int rank,odx,idx;
   grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@ template<class vobj,class sobj>
 inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 {
   GridBase *grid = l.getGrid();
-  assert(l.mode==CpuRead);
+  GRID_ASSERT(l.mode==CpuRead);
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
 
   int Nsimd = grid->Nsimd();
 
-  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  //  GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
 
   static const int words=sizeof(vobj)/sizeof(vector_type);
   int odx,idx;
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 {
   GridBase *grid=l.getGrid();
-  assert(l.mode==CpuWrite);
+  GRID_ASSERT(l.mode==CpuWrite);
 
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
 
   int Nsimd = grid->Nsimd();
 
-  //  assert( l.Checkerboard()== grid->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  //  GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
 
   static const int words=sizeof(vobj)/sizeof(vector_type);
   int odx,idx;
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 691faee6..861f3f06 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -292,26 +292,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 
   bool ok;
 #ifdef GRID_SYCL
-  uint64_t csum=0;
-  uint64_t csum2=0;
-  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-  {
-    // Hack
-    // Fast integer xor checksum. Can also be used in comms now.
-    autoView(l_v,left,AcceleratorRead);
-    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
-    uint64_t *base= (uint64_t *)&l_v[0];
-    csum=svm_xor(base,words);
-    ok = FlightRecorder::CsumLog(csum);
-    if ( !ok ) {
-      csum2=svm_xor(base,words);
-      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-    } else {
-      //      csum2=svm_xor(base,words);
-      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-    }
-    assert(ok);
-  }
+  //  uint64_t csum=0;
+  //  uint64_t csum2=0;
+  //  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  //  {
+  // Hack
+  // Fast integer xor checksum. Can also be used in comms now.
+  //    autoView(l_v,left,AcceleratorRead);
+  //    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+  //    uint64_t *base= (uint64_t *)&l_v[0];
+  //    csum=svm_xor(base,words);
+  //    ok = FlightRecorder::CsumLog(csum);
+  //    if ( !ok ) {
+  //      csum2=svm_xor(base,words);
+  //      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+  //    } else {
+  //      csum2=svm_xor(base,words);
+  //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+  //    }
+  //    GRID_ASSERT(ok);
+  // }
 #endif
   FlightRecorder::StepLog("rank inner product");
   ComplexD nrm = rankInnerProduct(left,right);
@@ -322,7 +322,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
     ComplexD nrm2 = rankInnerProduct(left,right);
     RealD local2 = real(nrm2);
     std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
-    assert(ok);
+    GRID_ASSERT(ok);
   }
   FlightRecorder::StepLog("Start global sum");
   grid->GlobalSumP2P(nrm);
@@ -376,40 +376,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
       coalescedWrite(z_v[ss],tmp);
   });
   bool ok;
-#ifdef GRID_SYCL
-  uint64_t csum=0;
-  uint64_t csum2=0;
-  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-  {
-    // z_v
-    {
-      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
-      uint64_t *base= (uint64_t *)&z_v[0];
-      csum=svm_xor(base,words);
-      ok = FlightRecorder::CsumLog(csum);
-      if ( !ok ) {
-	csum2=svm_xor(base,words);
-	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-      }
-      assert(ok);
-    }
-    // inner_v
-    {
-      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
-      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
-      csum=svm_xor(base,words);
-      ok = FlightRecorder::CsumLog(csum);
-      if ( !ok ) {
-	csum2=svm_xor(base,words);
-	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-      }
-      assert(ok);
-    }
-  }
-#endif
   nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
   ok = FlightRecorder::NormLog(real(nrm));
-  assert(ok);
+  GRID_ASSERT(ok);
   RealD local = real(nrm);
   grid->GlobalSum(nrm);
   FlightRecorder::ReductionLog(local,real(nrm));
@@ -495,13 +464,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_object::scalar_type scalar_type;
   GridBase  *grid = Data.Grid();
-  assert(grid!=NULL);
+  GRID_ASSERT(grid!=NULL);
 
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
 
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
+  GRID_ASSERT(orthogdim >= 0);
+  GRID_ASSERT(orthogdim < Nd);
 
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
@@ -588,14 +557,14 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
   typedef typename vobj::vector_type   vector_type;
   typedef typename vobj::scalar_type   scalar_type;
   GridBase  *grid = lhs.Grid();
-  assert(grid!=NULL);
+  GRID_ASSERT(grid!=NULL);
   conformable(grid,rhs.Grid());
 
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
 
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
+  GRID_ASSERT(orthogdim >= 0);
+  GRID_ASSERT(orthogdim < Nd);
 
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 91cb8226..849f5309 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -208,7 +208,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
 
   Integer numThreads, numBlocks;
   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  assert(ok);
+  GRID_ASSERT(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
   // Move out of UVM
diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h
index 292722c9..6d407036 100644
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
 
   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
   int lowerdims   = fine->_ndimension - coarse->_ndimension;
-  assert(lowerdims >= 0);
+  GRID_ASSERT(lowerdims >= 0);
   for(int d=0;d<lowerdims;d++){
-    assert(fine->_simd_layout[d]==1);
-    assert(fine->_processors[d]==1);
+    GRID_ASSERT(fine->_simd_layout[d]==1);
+    GRID_ASSERT(fine->_processors[d]==1);
   }
 
   int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
   // local and global volumes subdivide cleanly after SIMDization
   for(int d=0;d<rngdims;d++){
     int fd= d+lowerdims;
-    assert(coarse->_processors[d]  == fine->_processors[fd]);
-    assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-    assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); 
+    GRID_ASSERT(coarse->_processors[d]  == fine->_processors[fd]);
+    GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+    GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); 
 
     multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; 
   }
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
   int rngdims = coarse->_ndimension;
     
   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-  int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
+  int lowerdims   = fine->_ndimension - coarse->_ndimension;  GRID_ASSERT(lowerdims >= 0);
   // assumes that the higher dimensions are not using more processors
   // all further divisions are local
-  for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
-  for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+  for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1);
+  for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]);
 
   // then divide the number of local sites
   // check that the total number of sims agree, meanse the iSites are the same
-  assert(fine->Nsimd() == coarse->Nsimd());
+  GRID_ASSERT(fine->Nsimd() == coarse->Nsimd());
 
   // check that the two grids divide cleanly
-  assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+  GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
 
   return fine->lSites() / coarse->lSites();
 }
@@ -177,7 +177,7 @@ public:
 
     skip = skip<<shift;
 
-    assert((skip >> shift)==site); // check for overflow
+    GRID_ASSERT((skip >> shift)==site); // check for overflow
 
     eng.discard(skip);
 #else
@@ -218,7 +218,7 @@ public:
     GetState(saved,_generators[gen]);
   }
   void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
-    assert(saved.size()==RngStateCount);
+    GRID_ASSERT(saved.size()==RngStateCount);
     std::stringstream ss;
     for(int i=0;i<RngStateCount;i++){
       ss<< saved[i]<<" ";
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 1081dc9b..20ca49eb 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -31,15 +31,15 @@ NAMESPACE_BEGIN(Grid);
 
 inline void subdivides(GridBase *coarse,GridBase *fine)
 {
-  assert(coarse->_ndimension == fine->_ndimension);
+  GRID_ASSERT(coarse->_ndimension == fine->_ndimension); // both grids must have the same number of dimensions
 
   int _ndimension = coarse->_ndimension;
 
   // local and global volumes subdivide cleanly after SIMDization
   for(int d=0;d<_ndimension;d++){
-    assert(coarse->_processors[d]  == fine->_processors[d]);
-    assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
-    assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); 
+    GRID_ASSERT(coarse->_processors[d]  == fine->_processors[d]); // identical _processors entry in dimension d
+    GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]); // identical _simd_layout entry in dimension d
+    GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); // divide-then-multiply round trip: presumably integer division, so coarse extent must divide fine extent exactly
   }
 }
 
@@ -309,7 +309,7 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
                                const VLattice &Basis)
 {
   int NBatch = fineData.size();
-  assert(coarseData.size() == NBatch);
+  GRID_ASSERT(coarseData.size() == NBatch);
 
   GridBase * fine  = fineData[0].Grid();
   GridBase * coarse= coarseData[0].Grid();
@@ -344,7 +344,7 @@ template<class vobj,class vobj2,class CComplex>
   GridBase * coarse= coarseA.Grid();
 
   fineZ.Checkerboard()=fineX.Checkerboard();
-  assert(fineX.Checkerboard()==fineY.Checkerboard());
+  GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard());
   subdivides(coarse,fine); // require they map
   conformable(fineX,fineY);
   conformable(fineX,fineZ);
@@ -356,7 +356,7 @@ template<class vobj,class vobj2,class CComplex>
   // FIXME merge with subdivide checking routine as this is redundant
   for(int d=0 ; d<_ndimension;d++){
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
-    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
+    GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
   }
 
   autoView( fineZ_  , fineZ, AcceleratorWrite);
@@ -613,7 +613,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   int  _ndimension = coarse->_ndimension;
 
   // checks
-  assert( nbasis == Basis.size() );
+  GRID_ASSERT( nbasis == Basis.size() );
   subdivides(coarse,fine); 
   for(int i=0;i<nbasis;i++){
     conformable(Basis[i].Grid(),fine);
@@ -687,7 +687,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
                                const VLattice &Basis)
 {
   int NBatch = coarseData.size();
-  assert(fineData.size() == NBatch);
+  GRID_ASSERT(fineData.size() == NBatch);
 
   GridBase * fine   = fineData[0].Grid();
   GridBase * coarse = coarseData[0].Grid();
@@ -715,12 +715,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
   int ni = ig->_ndimension;
   int no = og->_ndimension;
 
-  assert(ni == no);
+  GRID_ASSERT(ni == no);
 
   for(int d=0;d<no;d++){
-    assert(ig->_processors[d]  == og->_processors[d]);
-    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
-    assert(ig->lSites() == og->lSites());
+    GRID_ASSERT(ig->_processors[d]  == og->_processors[d]);
+    GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]);
+    GRID_ASSERT(ig->lSites() == og->lSites());
   }
 
   autoView(in_v,in,CpuRead);
@@ -752,16 +752,16 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  assert(!Fg->_isCheckerBoarded);
-  assert(!Tg->_isCheckerBoarded);
+  GRID_ASSERT(!Fg->_isCheckerBoarded);
+  GRID_ASSERT(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
   int nd = nF;
-  assert(nF == nT);
+  GRID_ASSERT(nF == nT);
 
   for(int d=0;d<nd;d++){
-    assert(Fg->_processors[d]  == Tg->_processors[d]);
+    GRID_ASSERT(Fg->_processors[d]  == Tg->_processors[d]);
   }
 
   ///////////////////////////////////////////////////////////
@@ -821,12 +821,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
   //////////////////////////////////////////////////////////////////////////////////////////
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  assert(!Fg->_isCheckerBoarded);
-  assert(!Tg->_isCheckerBoarded);
+  GRID_ASSERT(!Fg->_isCheckerBoarded);
+  GRID_ASSERT(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
-  assert(nF+1 == nT);
+  GRID_ASSERT(nF+1 == nT);
 
   ///////////////////////////////////////////////////////////
   // do the index calc on the GPU
@@ -890,12 +890,12 @@ void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, in
   //////////////////////////////////////////////////////////////////////////////////////////
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  assert(!Fg->_isCheckerBoarded);
-  assert(!Tg->_isCheckerBoarded);
+  GRID_ASSERT(!Fg->_isCheckerBoarded);
+  GRID_ASSERT(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
-  assert(nT+1 == nF);
+  GRID_ASSERT(nT+1 == nF);
 
   ///////////////////////////////////////////////////////////
   // do the index calc on the GPU
@@ -955,16 +955,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
 
-  assert(nl+1 == nh);
-  assert(orthog<nh);
-  assert(orthog>=0);
-  assert(hg->_processors[orthog]==1);
+  GRID_ASSERT(nl+1 == nh);
+  GRID_ASSERT(orthog<nh);
+  GRID_ASSERT(orthog>=0);
+  GRID_ASSERT(hg->_processors[orthog]==1);
 
   int dl; dl = 0;
   for(int d=0;d<nh;d++){
     if ( d != orthog) {
-      assert(lg->_processors[dl]  == hg->_processors[d]);
-      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+      GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
+      GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
       dl++;
     }
   }
@@ -1005,17 +1005,17 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
 
-  assert(nl+1 == nh);
-  assert(orthog<nh);
-  assert(orthog>=0);
-  assert(hg->_processors[orthog]==1);
+  GRID_ASSERT(nl+1 == nh);
+  GRID_ASSERT(orthog<nh);
+  GRID_ASSERT(orthog>=0);
+  GRID_ASSERT(hg->_processors[orthog]==1);
   lowDim.Checkerboard() = higherDim.Checkerboard();
 
   int dl; dl = 0;
   for(int d=0;d<nh;d++){
     if ( d != orthog) {
-      assert(lg->_processors[dl]  == hg->_processors[d]);
-      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+      GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
+      GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
       dl++;
     }
   }
@@ -1056,14 +1056,14 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
 
-  assert(nl == nh);
-  assert(orthog<nh);
-  assert(orthog>=0);
+  GRID_ASSERT(nl == nh);
+  GRID_ASSERT(orthog<nh);
+  GRID_ASSERT(orthog>=0);
 
   for(int d=0;d<nh;d++){
     if ( d!=orthog ) {
-      assert(lg->_processors[d]  == hg->_processors[d]);
-      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+      GRID_ASSERT(lg->_processors[d]  == hg->_processors[d]);
+      GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]);
     }
   }
   Coordinate sz = lg->_ldimensions;
@@ -1093,7 +1093,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 
   subdivides(cg,fg); 
 
-  assert(cg->_ndimension==fg->_ndimension);
+  GRID_ASSERT(cg->_ndimension==fg->_ndimension);
 
   Coordinate ratio(cg->_ndimension);
 
@@ -1157,7 +1157,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
 
       int lex;
       Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
-      assert(lex < out.size());
+      GRID_ASSERT(lex < out.size());
       out_ptrs[lane] = &out[lex];
     }
     
@@ -1221,7 +1221,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   typedef typename vobj::vector_type vtype;
   
   GridBase* grid = out.Grid();
-  assert(in.size()==grid->lSites());
+  GRID_ASSERT(in.size()==grid->lSites());
   
   const int ndim     = grid->Nd();
   constexpr int nsimd    = vtype::Nsimd();
@@ -1268,7 +1268,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   typedef typename vobj::vector_type vtype;
   
   GridBase* grid = out._grid;
-  assert(in.size()==grid->lSites());
+  GRID_ASSERT(in.size()==grid->lSites());
   
   int ndim     = grid->Nd();
   int nsimd    = vtype::Nsimd();
@@ -1329,9 +1329,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 template<class VobjOut, class VobjIn>
 void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
-  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd());
   for(int d=0;d<out.Grid()->Nd();d++){
-    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
   }
   out.Checkerboard() = in.Checkerboard();
   GridBase *in_grid=in.Grid();
@@ -1382,9 +1382,9 @@ class precisionChangeWorkspace{
 public:
   precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
     //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out_grid->Nd() == in_grid->Nd());
+    GRID_ASSERT(out_grid->Nd() == in_grid->Nd());
     for(int d=0;d<out_grid->Nd();d++){
-      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
+      GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
     }
     int Nsimd_out = out_grid->Nsimd();
 
@@ -1549,7 +1549,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 
   int full_vecs   = full.size();
 
-  assert(full_vecs>=1);
+  GRID_ASSERT(full_vecs>=1);
 
   GridBase * full_grid = full[0].Grid();
   GridBase *split_grid = split.Grid();
@@ -1567,18 +1567,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
   //////////////////////////////
   // Checks
   //////////////////////////////
-  assert(full_grid->_ndimension==split_grid->_ndimension);
+  GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
   for(int n=0;n<full_vecs;n++){
-    assert(full[n].Checkerboard() == cb);
+    GRID_ASSERT(full[n].Checkerboard() == cb);
     for(int d=0;d<ndim;d++){
-      assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-      assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+      GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+      GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
     }
   }
 
   int   nvector   =full_nproc/split_nproc; 
-  assert(nvector*split_nproc==full_nproc);
-  assert(nvector == full_vecs);
+  GRID_ASSERT(nvector*split_nproc==full_nproc);
+  GRID_ASSERT(nvector == full_vecs);
 
   Coordinate ratio(ndim);
   for(int d=0;d<ndim;d++){
@@ -1622,7 +1622,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 
       int fvol   = lsites;
       
-      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
+      int chunk  = (nvec*fvol)/sP;          GRID_ASSERT(chunk*sP == nvec*fvol);
 
       // Loop over reordered data post A2A
       thread_for(c, chunk, {
@@ -1675,7 +1675,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 
   int full_vecs   = full.size();
 
-  assert(full_vecs>=1);
+  GRID_ASSERT(full_vecs>=1);
 
   GridBase * full_grid = full[0].Grid();
   GridBase *split_grid = split.Grid();
@@ -1693,18 +1693,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
   //////////////////////////////
   // Checks
   //////////////////////////////
-  assert(full_grid->_ndimension==split_grid->_ndimension);
+  GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
   for(int n=0;n<full_vecs;n++){
-    assert(full[n].Checkerboard() == cb);
+    GRID_ASSERT(full[n].Checkerboard() == cb);
     for(int d=0;d<ndim;d++){
-      assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-      assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+      GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+      GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
     }
   }
 
   int   nvector   =full_nproc/split_nproc; 
-  assert(nvector*split_nproc==full_nproc);
-  assert(nvector == full_vecs);
+  GRID_ASSERT(nvector*split_nproc==full_nproc);
+  GRID_ASSERT(nvector == full_vecs);
 
   Coordinate ratio(ndim);
   for(int d=0;d<ndim;d++){
@@ -1740,7 +1740,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
       auto lsites= rsites/M;                // Decreases rsites by M
       
       int fvol   = lsites;
-      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
+      int chunk  = (nvec*fvol)/sP;          GRID_ASSERT(chunk*sP == nvec*fvol);
 	
       {
 	// Loop over reordered data post A2A
diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h
index 1df4e6e0..4affe542 100644
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -123,7 +123,7 @@ public:
     case AcceleratorWrite:
     case CpuRead:
     case CpuWrite:
-      ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
+      ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
       break;
     } 
     
@@ -134,7 +134,7 @@ public:
     case AcceleratorWriteDiscard:
     case AcceleratorWrite:
     case CpuWrite:
-      ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
+      ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
       break;
     }
     
diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h
index 0340698c..23f6c13a 100644
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -82,10 +82,10 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
 
   int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
   int rNsimda= Nsimd/simd[dim]; // should be equal
-  assert(rNsimda==rNsimd);
+  GRID_ASSERT(rNsimda==rNsimd);
   int face_ovol=block*nblock;
 
-  //  assert(buf.size()==face_ovol*rNsimd);
+  //  GRID_ASSERT(buf.size()==face_ovol*rNsimd);
 
   /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
   //Let's make it work on GPU and then make a special accelerator_for that
@@ -172,7 +172,7 @@ template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
   
   int face_ovol=block*nblock;
 
-  //  assert(buf.size()==face_ovol*rNsimd);
+  //  GRID_ASSERT(buf.size()==face_ovol*rNsimd);
 
   /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
   //Let's make it work on GPU and then make a special accelerator_for that
@@ -247,7 +247,7 @@ public:
     Coordinate local     =unpadded_grid->LocalDimensions();
     Coordinate procs     =unpadded_grid->ProcessorGrid();
     for(int d=0;d<dims;d++){
-      if ( procs[d] > 1 ) assert(local[d]>=depth);
+      if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth);
     }
   }
   void DeleteGrids(void)
@@ -448,9 +448,9 @@ public:
     int nld   = to.Grid()->_ldimensions[dimension];
     const int Nsimd = vobj::Nsimd();
 
-    assert(depth<=lds[dimension]); // A must be on neighbouring node
-    assert(depth>0);   // A caller bug if zero
-    assert(ld+2*depth==nld);
+    GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
+    GRID_ASSERT(depth>0);   // A caller bug if zero
+    GRID_ASSERT(ld+2*depth==nld);
     ////////////////////////////////////////////////////////////////////////////
     // Face size and byte calculations
     ////////////////////////////////////////////////////////////////////////////
@@ -460,7 +460,7 @@ public:
     }
     buffer_size = buffer_size  / Nsimd;
     int rNsimd = Nsimd / simd[dimension];
-    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+    GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
 
     static deviceVector<vobj> send_buf; 
     static deviceVector<vobj> recv_buf;
diff --git a/Grid/log/Log.h b/Grid/log/Log.h
index ec6becd6..c761db00 100644
--- a/Grid/log/Log.h
+++ b/Grid/log/Log.h
@@ -33,10 +33,6 @@
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
 
-#ifdef HAVE_EXECINFO_H
-#include <execinfo.h>
-#endif
-
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -227,8 +223,6 @@ inline void Grid_pass(Args&&... args) {
     std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
 }
 
-#define _NBACKTRACE (256)
-extern void * Grid_backtrace_buffer[_NBACKTRACE];
 
 #define BACKTRACEFILE() {						\
     char string[20];							\
diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h
index 32964565..4df9fdf9 100644
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -293,9 +293,9 @@ class BinaryIO {
     // Flatten the file
     uint64_t lsites = grid->lSites();
     if ( control & BINARYIO_MASTER_APPEND )  {
-      assert(iodata.size()==1);
+      GRID_ASSERT(iodata.size()==1);
     } else {
-      assert(lsites==iodata.size());
+      GRID_ASSERT(lsites==iodata.size());
     }
     for(int d=0;d<ndim;d++){
       gStart[d] = lLattice[d]*pcoor[d];
@@ -326,20 +326,20 @@ class BinaryIO {
     // Sobj in MPI phrasing
     //////////////////////////////////////////////////////////////////////////////
     int ierr;
-    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    assert(ierr==0);
+    ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject);    GRID_ASSERT(ierr==0);
     ierr = MPI_Type_commit(&mpiObject);
 
     //////////////////////////////////////////////////////////////////////////////
     // File global array data type
     //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    assert(ierr==0);
-    ierr=MPI_Type_commit(&fileArray);    assert(ierr==0);
+    ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray);    GRID_ASSERT(ierr==0);
+    ierr=MPI_Type_commit(&fileArray);    GRID_ASSERT(ierr==0);
 
     //////////////////////////////////////////////////////////////////////////////
     // local lattice array
     //////////////////////////////////////////////////////////////////////////////
-    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    assert(ierr==0);
-    ierr=MPI_Type_commit(&localArray);    assert(ierr==0);
+    ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray);    GRID_ASSERT(ierr==0);
+    ierr=MPI_Type_commit(&localArray);    GRID_ASSERT(ierr==0);
 #endif
 
     //////////////////////////////////////////////////////////////////////////////
@@ -349,8 +349,8 @@ class BinaryIO {
     int ieee32    = (format == std::string("IEEE32"));
     int ieee64big = (format == std::string("IEEE64BIG"));
     int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
-    assert(ieee64||ieee32|ieee64big||ieee32big);
-    assert((ieee64+ieee32+ieee64big+ieee32big)==1);
+    GRID_ASSERT(ieee64||ieee32||ieee64big||ieee32big); // at least one recognised format flag must be set
+    GRID_ASSERT((ieee64+ieee32+ieee64big+ieee32big)==1);
     //////////////////////////////////////////////////////////////////////////////
     // Do the I/O
     //////////////////////////////////////////////////////////////////////////////
@@ -361,9 +361,9 @@ class BinaryIO {
       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
 	std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
-	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
-	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
-	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    GRID_ASSERT(ierr==0);
+	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    GRID_ASSERT(ierr==0);
+	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    GRID_ASSERT(ierr==0);
 	MPI_File_close(&fh);
 	MPI_Type_free(&fileArray);
 	MPI_Type_free(&localArray);
@@ -384,13 +384,14 @@ class BinaryIO {
           fin.seekg(offset + myrank * lsites * sizeof(fobj));
         }
         fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
-        assert(fin.fail() == 0);
+        GRID_ASSERT(fin.fail() == 0);
         fin.close();
       }
-      timer.Stop();
-
+      
       grid->Barrier();
 
+      timer.Stop(); // timed region now ends after grid->Barrier()
+
       bstimer.Start();
       ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
       if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
@@ -435,11 +436,11 @@ class BinaryIO {
 
         std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
-        assert(ierr == 0);
+        GRID_ASSERT(ierr == 0);
 
         std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
-        assert(ierr == 0);
+        GRID_ASSERT(ierr == 0);
 
         MPI_Offset os;
         MPI_File_get_position(fh, &os);
@@ -506,6 +507,7 @@ class BinaryIO {
   offset  = fout.tellp();
 	fout.close();
       }
+      grid->Barrier();
       timer.Stop();
     }
     
diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h
index 12d97afc..9851892b 100644
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -290,7 +290,7 @@ class GridLimeReader : public BinaryIO {
 	return;
       }      
     }
-    assert(0);
+    GRID_ASSERT(0);
   }
   ////////////////////////////////////////////
   // Read a generic serialisable object
@@ -315,7 +315,7 @@ class GridLimeReader : public BinaryIO {
       }
 
     }  
-    assert(0);
+    GRID_ASSERT(0);
   }
 
   template<class serialisable_object>
@@ -349,7 +349,7 @@ class GridLimeWriter : public BinaryIO
      filename= _filename;
      if ( boss_node ) {
        File = fopen(filename.c_str(), "w");
-       LimeW = limeCreateWriter(File); assert(LimeW != NULL );
+       GRID_ASSERT(File != NULL); LimeW = limeCreateWriter(File); GRID_ASSERT(LimeW != NULL ); // fail early if fopen did not succeed
      }
    }
    /////////////////////////////////////////////
@@ -369,7 +369,7 @@ class GridLimeWriter : public BinaryIO
     if ( boss_node ) {
       LimeRecordHeader *h;
       h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-      assert(limeWriteRecordHeader(h, LimeW) >= 0);
+      int hdr_status = limeWriteRecordHeader(h, LimeW); GRID_ASSERT(hdr_status >= 0); // keep the write outside the assertion expression
       limeDestroyHeader(h);
     }
     return LIME_SUCCESS;
@@ -387,11 +387,11 @@ class GridLimeWriter : public BinaryIO
       //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
       int err;
       LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
-      assert(h!= NULL);
+      GRID_ASSERT(h!= NULL);
       
-      err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
-      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
-      err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
+      err=limeWriteRecordHeader(h, LimeW);                    GRID_ASSERT(err>=0);
+      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); GRID_ASSERT(err>=0);
+      err=limeWriterCloseRecord(LimeW);                       GRID_ASSERT(err>=0);
       limeDestroyHeader(h);
     }
   }
@@ -432,7 +432,7 @@ class GridLimeWriter : public BinaryIO
     ////////////////////////////////////////////////////////////////////
     
     GridBase *grid = field.Grid();
-    assert(boss_node == field.Grid()->IsBoss() );
+    GRID_ASSERT(boss_node == field.Grid()->IsBoss() );
 
     FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
 
@@ -474,7 +474,7 @@ class GridLimeWriter : public BinaryIO
     if ( boss_node ) {
       fseek(File,0,SEEK_END);             
       uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
-      assert( (offset2-offset1) == PayloadSize);
+      GRID_ASSERT( (offset2-offset1) == PayloadSize);
     }
 
     /////////////////////////////////////////////////////////////
@@ -482,7 +482,7 @@ class GridLimeWriter : public BinaryIO
     /////////////////////////////////////////////////////////////
 
     if ( boss_node ) { 
-      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+      err=limeWriterCloseRecord(LimeW);  GRID_ASSERT(err>=0);
     }
     ////////////////////////////////////////
     // Write checksum element, propagaing forward from the BinaryIO
@@ -622,8 +622,8 @@ class IldgWriter : public ScidacWriter {
     uint64_t PayloadSize = LFN.size();
     int err;
     createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
-    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
-    err=limeWriterCloseRecord(LimeW); assert(err>=0);
+    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); GRID_ASSERT(err>=0);
+    err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
   }
 
   ////////////////////////////////////////////////////////////////
@@ -657,7 +657,7 @@ class IldgWriter : public ScidacWriter {
     header.sequence_number = sequence;
     header.ildg_lfn = LFN;
 
-    assert ( (format == std::string("IEEE32BIG"))  
+    GRID_ASSERT ( (format == std::string("IEEE32BIG"))  
            ||(format == std::string("IEEE64BIG")) );
 
     //////////////////////////////////////////////////////
@@ -677,8 +677,8 @@ class IldgWriter : public ScidacWriter {
     ildgfmt.ly = header.dimension[1];
     ildgfmt.lz = header.dimension[2];
     ildgfmt.lt = header.dimension[3];
-    assert(header.nd==4);
-    assert(header.nd==header.dimension.size());
+    GRID_ASSERT(header.nd==4);
+    GRID_ASSERT(header.nd==header.dimension.size());
 
     //////////////////////////////////////////////////////////////////////////////
     // Field norm tests
@@ -735,7 +735,7 @@ class IldgReader : public GridLimeReader {
 
     Coordinate dims = Umu.Grid()->FullDimensions();
 
-    assert(dims.size()==4);
+    GRID_ASSERT(dims.size()==4);
 
     // Metadata holders
     ildgFormat     ildgFormat_    ;
@@ -794,10 +794,10 @@ class IldgReader : public GridLimeReader {
 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
 	  if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
 
-	  assert( ildgFormat_.lx == dims[0]);
-	  assert( ildgFormat_.ly == dims[1]);
-	  assert( ildgFormat_.lz == dims[2]);
-	  assert( ildgFormat_.lt == dims[3]);
+	  GRID_ASSERT( ildgFormat_.lx == dims[0]);
+	  GRID_ASSERT( ildgFormat_.ly == dims[1]);
+	  GRID_ASSERT( ildgFormat_.lz == dims[2]);
+	  GRID_ASSERT( ildgFormat_.lt == dims[3]);
 
 	  found_ildgFormat = 1;
 	}
@@ -814,10 +814,10 @@ class IldgReader : public GridLimeReader {
 
 	  format = FieldMetaData_.floating_point;
 
-	  assert(FieldMetaData_.dimension[0] == dims[0]);
-	  assert(FieldMetaData_.dimension[1] == dims[1]);
-	  assert(FieldMetaData_.dimension[2] == dims[2]);
-	  assert(FieldMetaData_.dimension[3] == dims[3]);
+	  GRID_ASSERT(FieldMetaData_.dimension[0] == dims[0]);
+	  GRID_ASSERT(FieldMetaData_.dimension[1] == dims[1]);
+	  GRID_ASSERT(FieldMetaData_.dimension[2] == dims[2]);
+	  GRID_ASSERT(FieldMetaData_.dimension[3] == dims[3]);
 
 	  found_FieldMetaData = 1;
 	}
@@ -867,13 +867,13 @@ class IldgReader : public GridLimeReader {
     // Minimally must find binary segment and checksum
     // Since this is an ILDG reader require ILDG format
     //////////////////////////////////////////////////////
-    assert(found_ildgLFN);
-    assert(found_ildgBinary);
-    assert(found_ildgFormat);
-    assert(found_scidacChecksum);
+    GRID_ASSERT(found_ildgLFN);
+    GRID_ASSERT(found_ildgBinary);
+    GRID_ASSERT(found_ildgFormat);
+    GRID_ASSERT(found_scidacChecksum);
 
     // Must find something with the lattice dimensions
-    assert(found_FieldMetaData||found_ildgFormat);
+    GRID_ASSERT(found_FieldMetaData||found_ildgFormat);
 
     if ( found_FieldMetaData ) {
 
@@ -881,9 +881,9 @@ class IldgReader : public GridLimeReader {
 
     } else { 
 
-      assert(found_ildgFormat);
+      GRID_ASSERT(found_ildgFormat);
       const std::string stNC = std::to_string( Nc ) ;
-      assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
+      GRID_ASSERT ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
 
       ///////////////////////////////////////////////////////////////////////////////////////
       // Populate our Grid metadata as best we can
@@ -928,20 +928,20 @@ class IldgReader : public GridLimeReader {
       FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
       FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
       scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
-      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
-      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
+      GRID_ASSERT( scidac_csuma ==FieldMetaData_.scidac_checksuma);
+      GRID_ASSERT( scidac_csumb ==FieldMetaData_.scidac_checksumb);
       std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
     } else { 
       std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
-      assert(0); // Can I insist always checksum ?
+      GRID_ASSERT(0); // Can I insist always checksum ?
     }
 
     if ( found_FieldMetaData || found_usqcdInfo ) {
       FieldMetaData checker;
       stats Stats;
       Stats(Umu,checker);
-      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
-      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
+      GRID_ASSERT(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
+      GRID_ASSERT(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
       std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
     }
   }
diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h
index 6b9d8708..1fc3fe31 100644
--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -203,7 +203,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  GRID_ASSERT( Nc < 4 && Nc > 1 ) ;
   for(int mu=0;mu<Nd;mu++){
     #if Nc == 2
       cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
@@ -240,7 +240,7 @@ struct BinarySimpleUnmunger {
     sobj_stype *in_buffer = (sobj_stype *)&in;
     size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
     size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
-    assert(fobj_words == sobj_words);
+    GRID_ASSERT(fobj_words == sobj_words);
     
     for (unsigned int word = 0; word < sobj_words; word++)
       out_buffer[word] = in_buffer[word];  // type conversion on the fly
@@ -259,7 +259,7 @@ struct BinarySimpleMunger {
     sobj_stype *out_buffer = (sobj_stype *)&out;
     size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
     size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
-    assert(fobj_words == sobj_words);
+    GRID_ASSERT(fobj_words == sobj_words);
     
     for (unsigned int word = 0; word < sobj_words; word++)
       out_buffer[word] = in_buffer[word];  // type conversion on the fly
diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
index b0bce0fb..cfdc812b 100644
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -76,7 +76,7 @@ public:
     removeWhitespace(line);
     std::cout << GridLogMessage << "* " << line << std::endl;
 
-    assert(line==std::string("BEGIN_HEADER"));
+    GRID_ASSERT(line==std::string("BEGIN_HEADER"));
 
     do {
       getline(fin,line); // read one line
@@ -106,9 +106,9 @@ public:
     field.dimension[2] = std::stol(header["DIMENSION_3"]);
     field.dimension[3] = std::stol(header["DIMENSION_4"]);
 
-    assert(grid->_ndimension == 4);
+    GRID_ASSERT(grid->_ndimension == 4);
     for(int d=0;d<4;d++){
-      assert(grid->_fdimensions[d]==field.dimension[d]);
+      GRID_ASSERT(grid->_fdimensions[d]==field.dimension[d]);
     }
 
     field.link_trace = std::stod(header["LINK_TRACE"]);
@@ -183,7 +183,7 @@ public:
 	   nersc_csum,scidac_csuma,scidac_csumb);
       }
     } else {
-      assert(0);
+      GRID_ASSERT(0);
     }
 
     GaugeStats Stats; Stats(Umu,clone);
@@ -205,9 +205,9 @@ public:
       std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
       exit(0);
     }
-    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
-    assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-    assert(nersc_csum == header.checksum );
+    if(exitOnReadPlaquetteMismatch()) GRID_ASSERT(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    GRID_ASSERT(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+    GRID_ASSERT(nersc_csum == header.checksum );
       
     std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
   }
@@ -246,7 +246,7 @@ public:
     GridBase *grid = Umu.Grid();
 
     GridMetaData(grid,header);
-    assert(header.nd==4);
+    GRID_ASSERT(header.nd==4);
     GaugeStats Stats; Stats(Umu,header);
     MachineCharacteristics(header);
 
@@ -302,7 +302,7 @@ public:
     GridBase *grid = parallel.Grid();
 
     GridMetaData(grid,header);
-    assert(header.nd==4);
+    GRID_ASSERT(header.nd==4);
     header.link_trace=0.0;
     header.plaquette=0.0;
     MachineCharacteristics(header);
@@ -355,16 +355,16 @@ public:
     std::string data_type(header.data_type);
 
 #ifdef RNG_RANLUX
-    assert(format == std::string("UINT64"));
-    assert(data_type == std::string("RANLUX48"));
+    GRID_ASSERT(format == std::string("UINT64"));
+    GRID_ASSERT(data_type == std::string("RANLUX48"));
 #endif
 #ifdef RNG_MT19937
-    assert(format == std::string("UINT32"));
-    assert(data_type == std::string("MT19937"));
+    GRID_ASSERT(format == std::string("UINT32"));
+    GRID_ASSERT(data_type == std::string("MT19937"));
 #endif
 #ifdef RNG_SITMO
-    assert(format == std::string("UINT64"));
-    assert(data_type == std::string("SITMO"));
+    GRID_ASSERT(format == std::string("UINT64"));
+    GRID_ASSERT(data_type == std::string("SITMO"));
 #endif
 
     // depending on datatype, set up munger;
@@ -376,7 +376,7 @@ public:
       std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
       exit(0);
     }
-    assert(nersc_csum == header.checksum );
+    GRID_ASSERT(nersc_csum == header.checksum );
 
     std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
   }
diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h
index 0be2c88d..fddd9919 100644
--- a/Grid/parallelIO/OpenQcdIO.h
+++ b/Grid/parallelIO/OpenQcdIO.h
@@ -49,7 +49,7 @@ public:
     {
       std::ifstream fin(file, std::ios::in | std::ios::binary);
       fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
-      assert(!fin.fail());
+      GRID_ASSERT(!fin.fail());
       field.data_start = fin.tellg();
       fin.close();
     }
@@ -57,10 +57,10 @@ public:
     header.plaq /= normalisationFactor;
 
     // sanity check (should trigger on endian issues)
-    assert(0 < header.Nt && header.Nt <= 1024);
-    assert(0 < header.Nx && header.Nx <= 1024);
-    assert(0 < header.Ny && header.Ny <= 1024);
-    assert(0 < header.Nz && header.Nz <= 1024);
+    GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
+    GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
+    GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
+    GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
 
     field.dimension[0] = header.Nx;
     field.dimension[1] = header.Ny;
@@ -71,9 +71,9 @@ public:
     std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
     std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
 
-    assert(grid->_ndimension == Nd);
+    GRID_ASSERT(grid->_ndimension == Nd);
     for(int d = 0; d < Nd; d++)
-      assert(grid->_fdimensions[d] == field.dimension[d]);
+      GRID_ASSERT(grid->_fdimensions[d] == field.dimension[d]);
 
     field.plaquette = header.plaq;
 
@@ -86,10 +86,10 @@ public:
                                        std::string                           file) {
     typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
 
-    assert(Ns == 4 and Nd == 4 and Nc == 3);
+    GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
 
     auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
-    assert(grid != nullptr); assert(grid->_ndimension == Nd);
+    GRID_ASSERT(grid != nullptr); GRID_ASSERT(grid->_ndimension == Nd);
 
     uint64_t offset = readHeader(file, Umu.Grid(), header);
 
@@ -171,7 +171,7 @@ public:
 
     if(plaq_diff >= tol)
       std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
-    assert(plaq_diff < tol);
+    GRID_ASSERT(plaq_diff < tol);
 
     std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
   }
diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h
index 886536ad..c2bf22e7 100644
--- a/Grid/parallelIO/OpenQcdIOChromaReference.h
+++ b/Grid/parallelIO/OpenQcdIOChromaReference.h
@@ -62,7 +62,7 @@ public:
     : swap(false)
     , grid(gridPtr) {
     err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
-    assert(err == MPI_SUCCESS);
+    GRID_ASSERT(err == MPI_SUCCESS);
   }
 
   virtual ~ParRdr() { MPI_File_close(&fp); }
@@ -76,8 +76,8 @@ public:
   }
 
   int readHeader(FieldMetaData& field) {
-    assert((grid->_ndimension == Nd) && (Nd == 4));
-    assert(Nc == 3);
+    GRID_ASSERT((grid->_ndimension == Nd) && (Nd == 4));
+    GRID_ASSERT(Nc == 3);
 
     OpenQcdHeader header;
 
@@ -86,10 +86,10 @@ public:
     header.plaq /= 3.; // TODO change this into normalizationfactor
 
     // sanity check (should trigger on endian issues) TODO remove?
-    assert(0 < header.Nt && header.Nt <= 1024);
-    assert(0 < header.Nx && header.Nx <= 1024);
-    assert(0 < header.Ny && header.Ny <= 1024);
-    assert(0 < header.Nz && header.Nz <= 1024);
+    GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
+    GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
+    GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
+    GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
 
     field.dimension[0] = header.Nx;
     field.dimension[1] = header.Ny;
@@ -97,7 +97,7 @@ public:
     field.dimension[3] = header.Nt;
 
     for(int d = 0; d < Nd; d++)
-      assert(grid->FullDimensions()[d] == field.dimension[d]);
+      GRID_ASSERT(grid->FullDimensions()[d] == field.dimension[d]);
 
     field.plaquette = header.plaq;
 
@@ -114,15 +114,15 @@ public:
     int read = -1;
     MPI_Get_count(&status, datatype, &read);
     // CHECK_VAR(read)
-    assert(nbytes == (uint64_t)read);
-    assert(err == MPI_SUCCESS);
+    GRID_ASSERT(nbytes == (uint64_t)read);
+    GRID_ASSERT(err == MPI_SUCCESS);
   }
 
   void createTypes() {
     constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
 
-    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
-    err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
 
     Coordinate const L = grid->GlobalDimensions();
     Coordinate const l = grid->LocalDimensions();
@@ -132,20 +132,20 @@ public:
     Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
     Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
 
-    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
-    err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
+    err = MPI_Type_commit(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
   }
 
   void freeTypes() {
-    err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
-    err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
+    err = MPI_Type_free(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
+    err = MPI_Type_free(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
   }
 
   bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
     auto hdr_offset = readHeader(meta);
     CHECK
     createTypes();
-    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
+    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); GRID_ASSERT(err == MPI_SUCCESS);
     CHECK
     int const domainSites = grid->lSites();
     domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
@@ -166,7 +166,7 @@ public:
     CHECK
     err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
   errInfo(err, "MPI_File_set_view1");
-    assert(err == MPI_SUCCESS);
+    GRID_ASSERT(err == MPI_SUCCESS);
     freeTypes();
 
     std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
@@ -182,7 +182,7 @@ public:
                                        std::string                           file) {
     typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
 
-    assert(Ns == 4 and Nd == 4 and Nc == 3);
+    GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
 
     auto grid = Umu.Grid();
 
@@ -225,7 +225,7 @@ public:
 
     if(plaq_diff >= tol)
       std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
-    assert(plaq_diff < tol);
+    GRID_ASSERT(plaq_diff < tol);
 
     std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
   }
@@ -246,7 +246,7 @@ private:
   static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
                                          std::vector<ColourMatrixD> const&      node_buff,
                                          GridBase*                              grid) {
-    assert(node_buff.size() == Nd * grid->lSites());
+    GRID_ASSERT(node_buff.size() == Nd * grid->lSites());
 
     Coordinate const& l = grid->LocalDimensions();
 
@@ -274,7 +274,7 @@ private:
             buff_idx += 2 * Nd;
           }
 
-    assert(node_buff.size() == buff_idx);
+    GRID_ASSERT(node_buff.size() == buff_idx);
   }
 };
 
diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h
index 62b2a740..57bcc538 100644
--- a/Grid/perfmon/PerfCount.h
+++ b/Grid/perfmon/PerfCount.h
@@ -146,8 +146,8 @@ public:
 
   PerformanceCounter(int _pct) {
 #ifdef __linux__
-    assert(_pct>=0);
-    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
+    GRID_ASSERT(_pct>=0);
+    GRID_ASSERT(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
     fd=-1;
     cyclefd=-1;
     count=0;
@@ -213,7 +213,7 @@ public:
       ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
       ign=::read(fd, &count, sizeof(long long));
       ign+=::read(cyclefd, &cycles, sizeof(long long));
-      assert(ign==2*sizeof(long long));
+      GRID_ASSERT(ign==2*sizeof(long long));
     }
     elapsed = cyclecount() - begin;
 #else
diff --git a/Grid/perfmon/Stat.cc b/Grid/perfmon/Stat.cc
index 4c3be254..06262012 100644
--- a/Grid/perfmon/Stat.cc
+++ b/Grid/perfmon/Stat.cc
@@ -150,7 +150,7 @@ void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
   }
   int type;
   int ret = fscanf(fp, "%d", &type);
-  assert(ret == 1);
+  GRID_ASSERT(ret == 1);
   fclose(fp);
   //  std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
 
diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h
index ba5df85a..7434322f 100644
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -60,12 +60,16 @@ inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
 {
+  double secs = 1.0*now.count()*1.0e-3;
+  stream << secs<<" s";
+  /*
   GridSecs second(1);
   auto     secs       = now/second ; 
   auto     subseconds = now%second ;
   auto     fill       = stream.fill();
   stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
   stream.fill(fill);
+  */
   return stream;
 }
 inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
@@ -90,14 +94,14 @@ public:
     Reset();
   }
   void     Start(void) { 
-    assert(running == false);
+    GRID_ASSERT(running == false);
 #ifdef TIMERS_ON
     start = GridClock::now(); 
 #endif
     running = true;
   }
   void     Stop(void)  { 
-    assert(running == true);
+    GRID_ASSERT(running == true);
 #ifdef TIMERS_ON
     accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); 
 #endif
@@ -111,11 +115,11 @@ public:
     accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
   }
   GridTime Elapsed(void) const {
-    assert(running == false);
+    GRID_ASSERT(running == false);
     return std::chrono::duration_cast<GridTime>( accumulator );
   }
   uint64_t useconds(void) const {
-    assert(running == false);
+    GRID_ASSERT(running == false);
     return (uint64_t) accumulator.count();
   }
   bool isRunning(void) const {
diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index dbedfa7c..5b35dc64 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -596,16 +596,32 @@ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
 //////////////////////////////////////////
 // Trace lattice and non-lattice
 //////////////////////////////////////////
+#define GRID_UNOP(name)   name
+#define GRID_DEF_UNOP(op, name)						\
+  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
+  inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
+  {									\
+    return     LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
+  }
+
 template<int Index,class vobj>
 inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
 {
   return traceIndex<SpinIndex>(lhs);
 }
+
+GridUnopClass(UnaryTraceSpin, traceIndex<SpinIndex>(a));
+GRID_DEF_UNOP(traceSpin, UnaryTraceSpin);
+
 template<int Index,class vobj>
 inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
 {
   return traceIndex<ColourIndex>(lhs);
 }
+
+GridUnopClass(UnaryTraceColour, traceIndex<ColourIndex>(a));
+GRID_DEF_UNOP(traceColour, UnaryTraceColour);
+
 template<int Index,class vobj>
 inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
 {
@@ -617,6 +633,8 @@ inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIn
   return traceIndex<ColourIndex>(lhs);
 }
 
+#undef GRID_UNOP
+#undef GRID_DEF_UNOP
 //////////////////////////////////////////
 // Current types
 //////////////////////////////////////////
diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h
index c3a46729..96afea0a 100644
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -136,9 +136,9 @@ class EmptyAction : public Action <GaugeField>
   using Action<GaugeField>::Sinitial;
   using Action<GaugeField>::deriv;
 
-  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
+  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { GRID_ASSERT(0);}; // refresh pseudofermions
   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action
-  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative
+  virtual void deriv(const GaugeField& U, GaugeField& dSdU) { GRID_ASSERT(0); };        // evaluate the action derivative
 
   ///////////////////////////////
   // Logging
diff --git a/Grid/qcd/action/ActionSet.h b/Grid/qcd/action/ActionSet.h
index e6879fe5..871893af 100644
--- a/Grid/qcd/action/ActionSet.h
+++ b/Grid/qcd/action/ActionSet.h
@@ -77,7 +77,7 @@ public:
     actions(std::get<0>(actions_hirep)), multiplier(mul) {
     // initialize the hirep vectors to zero.
     // apply(this->resize, actions_hirep, 0); //need a working resize
-    assert(mul >= 1);
+    GRID_ASSERT(mul >= 1);
   }
 
   template < class GenField >
diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h
index ec80b692..77491def 100644
--- a/Grid/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -126,7 +126,7 @@ public:
 
   // possible boost
   std::vector<ComplexD> qmu;
-  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; GRID_ASSERT(qmu.size()==Nd);};
   void addQmu(const FermionField &in, FermionField &out, int dag);
   
   // Cayley form Moebius (tanh and zolotarev)
diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index d94f31d4..cee2bc2f 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -181,7 +181,7 @@ public:
   }
 
   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
+    GRID_ASSERT(0);
     return lambda;
   }
 
@@ -324,7 +324,7 @@ public:
   }
 
   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
-    assert(0);
+    GRID_ASSERT(0);
     return lambda;
   }
 
diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
index d79b34d4..18287d8a 100644
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -210,8 +210,8 @@ private:
 
   template<class Field>
   void ApplyBoundaryMask(Field& f) {
-    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
-    assert(m != nullptr);
+    const MaskField* m = getCorrectMaskField(f);
+    GRID_ASSERT(m != nullptr); // m is dereferenced below, so it must be non-null
     CompactHelpers::ApplyBoundaryMask(f, *m);
   }
 
diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
index 2c6aa587..467a5d48 100644
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
@@ -164,8 +164,8 @@ private:
 
   template<class Field>
   void ApplyBoundaryMask(Field& f) {
-    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
-    assert(m != nullptr);
+    const MaskField* m = getCorrectMaskField(f);
+    GRID_ASSERT(m != nullptr); // m is dereferenced below, so it must be non-null
     CompactHelpers::ApplyBoundaryMask(f, *m);
   }
 
diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
index 3fb84cd5..6a658672 100644
--- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -74,8 +74,8 @@ public:
     FermionField in_buf(in.Grid()); in_buf = Zero();
     typedef typename Simd::scalar_type Scalar;
     Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    GRID_ASSERT(twist.size() == Nd);//check that twist is Nd
+    GRID_ASSERT(boundary.size() == Nd);//check that boundary conditions is Nd
     int shift = 0;
     for(unsigned int nu = 0; nu < Nd; nu++)
       {
diff --git a/Grid/qcd/action/fermion/DWFSlow.h b/Grid/qcd/action/fermion/DWFSlow.h
index 61298504..0ffc3ec6 100644
--- a/Grid/qcd/action/fermion/DWFSlow.h
+++ b/Grid/qcd/action/fermion/DWFSlow.h
@@ -110,9 +110,9 @@ public:
   // Derivative interface
   ////////////////////////
   // Interface calls an internal routine
-  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)  { assert(0);};
-  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
-  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
+  void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)  { GRID_ASSERT(0);};
+  void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ GRID_ASSERT(0);};
+  void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ GRID_ASSERT(0);};
 
   ///////////////////////////////////////////////////////////////
   // non-hermitian hopping term; half cb or both
@@ -128,7 +128,7 @@ public:
   void DhopOE(const FermionField &in, FermionField &out, int dag)
   {
     FermionField tmp(in.Grid());
-    assert(in.Checkerboard()==Even);
+    GRID_ASSERT(in.Checkerboard()==Even);
     Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
     for(int mu=0;mu<4;mu++){
       DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag );    out = out + tmp;
@@ -137,7 +137,7 @@ public:
   void DhopEO(const FermionField &in, FermionField &out, int dag)
   {
     FermionField tmp(in.Grid());
-    assert(in.Checkerboard()==Odd);
+    GRID_ASSERT(in.Checkerboard()==Odd);
     Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );  
     for(int mu=0;mu<4;mu++){
       DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag );    out = out + tmp;
@@ -147,11 +147,11 @@ public:
   ///////////////////////////////////////////////////////////////
   // Multigrid assistance; force term uses too
   ///////////////////////////////////////////////////////////////
-  void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
-  void MdirAll(const FermionField &in, std::vector<FermionField> &out)   { assert(0);};
-  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
-  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out)    { assert(0);};
-  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
+  void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ GRID_ASSERT(0);};
+  void MdirAll(const FermionField &in, std::vector<FermionField> &out)   { GRID_ASSERT(0);};
+  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { GRID_ASSERT(0);};
+  void DhopDirAll(const FermionField &in, std::vector<FermionField> &out)    { GRID_ASSERT(0);};
+  void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { GRID_ASSERT(0);};
 
   void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
   {
diff --git a/Grid/qcd/action/fermion/DomainWallFermion.h b/Grid/qcd/action/fermion/DomainWallFermion.h
index 5639debe..540ba3a6 100644
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -123,7 +123,7 @@ public:
     RealD eps = 1.0;
 
     Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-    assert(zdata->n==this->Ls);
+    GRID_ASSERT(zdata->n==this->Ls);
 	
     //    std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
     // Call base setter
diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
index 0c8a0930..3de1ceb0 100644
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -134,25 +134,25 @@ public:
       
   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
   {
-    assert(0);
+    GRID_ASSERT(0);
   }
 
   inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
-    assert(0);
+    GRID_ASSERT(0);
   } 
 
   inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
-    assert(0);
+    GRID_ASSERT(0);
   }
 
   inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
-    assert(0);
+    GRID_ASSERT(0);
   }
 
 
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
 
-    assert(0);
+    GRID_ASSERT(0);
     // Following lines to be revised after Peter's addition of half prec
     // missing put lane...
     /*
@@ -184,7 +184,7 @@ public:
       slocal_coor[0] = s;
       for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
       int sF = Bgrid->oIndexReduced(slocal_coor);  
-      assert(sF < Bgrid->oSites());
+      GRID_ASSERT(sF < Bgrid->oSites());
 
       extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres); 
       // sum across the 5d dimension
diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h
index 66644d7f..98d4be14 100644
--- a/Grid/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -49,7 +49,7 @@ public:
 
   virtual FermionField &tmp(void) = 0;
 
-  virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
+  virtual void DirichletBlock(const Coordinate & _Block) { GRID_ASSERT(0); };
   
   GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
   GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };
@@ -93,7 +93,7 @@ public:
   virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 
 
-  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
+  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { GRID_ASSERT(0);};
 
   virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) 
       {
diff --git a/Grid/qcd/action/fermion/FourierAcceleratedPV.h b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
index bf23ff75..6d8e3f6c 100644
--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -38,11 +38,11 @@ NAMESPACE_BEGIN(Grid);
     c=m.cs[0];
     std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
     for (size_t i=1;i<m.bs.size();i++) {
-      assert(m.bs[i] == b);
-      assert(m.cs[i] == c);
+      GRID_ASSERT(m.bs[i] == b);
+      GRID_ASSERT(m.cs[i] == c);
     }
-    assert(b.imag() == 0.0);
-    assert(c.imag() == 0.0);
+    GRID_ASSERT(b.imag() == 0.0);
+    GRID_ASSERT(c.imag() == 0.0);
     _b = b.real();
     _c = c.real();
   }
@@ -62,7 +62,7 @@ class FourierAcceleratedPV {
   FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2) 
    : dwfPV(_dwfPV), Umu(_Umu), cg(_cg), group_in_s(_group_in_s) 
   {
-    assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
+    GRID_ASSERT( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
     grid5D   = SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
     gridRB5D = SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
   }
diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h
index 8017bc76..2649ffe7 100644
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -91,7 +91,7 @@ public:
 					  const _Spinor &chi, 
 					  int mu) 
   {
-    assert(0);
+    GRID_ASSERT(0);
   } 
 
   template<class _Spinor>
@@ -147,9 +147,9 @@ public:
     // Fixme X.Y.Z.T hardcode in stencil
     int mmu = mu % Nd;
         
     // assert our assumptions
-    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
-    assert((sl == 1) || (sl == 2));
+    GRID_ASSERT((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
+    GRID_ASSERT((sl == 1) || (sl == 2));
 
     //If this site is an global boundary site, perform the G-parity flavor twist
     if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
@@ -162,7 +162,7 @@ public:
 
 	  St.iCoorFromIindex(icoor,s);
               
-	  assert((icoor[direction]==0)||(icoor[direction]==1));
+	  GRID_ASSERT((icoor[direction]==0)||(icoor[direction]==1));
               
 	  int permute_lane;
 	  if ( distance == 1) {
@@ -200,7 +200,7 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
   {
-    assert(0);
+    GRID_ASSERT(0);
   }
 
   template <class ref>
@@ -344,11 +344,11 @@ public:
       
  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
    //mat = outerProduct(Btilde, A);
-   assert(0);
+   GRID_ASSERT(0);
   }
 
   inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
-    assert(0);
+    GRID_ASSERT(0);
     /*
     auto tmp = TraceIndex<SpinIndex>(P);
     parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
@@ -358,7 +358,7 @@ public:
   }
 
   inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
-    assert(0);
+    GRID_ASSERT(0);
   }
  
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
index e567c19f..b202e301 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -125,7 +125,7 @@ public:
 			   const ImplParams &p = ImplParams());
 
   // DoubleStore impl dependent
-  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  void ImportGauge      (const GaugeField &_Uthin ) { GRID_ASSERT(0); }
   void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
   void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
   void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index ce65bfa3..5ce6d241 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -146,7 +146,7 @@ public:
 			     const ImplParams &p= ImplParams());
     
     // DoubleStore gauge field in operator
-    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+    void ImportGauge      (const GaugeField &_Uthin ) { GRID_ASSERT(0); }
   void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
     void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
     void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
diff --git a/Grid/qcd/action/fermion/MADWF.h b/Grid/qcd/action/fermion/MADWF.h
index 5d17e865..74f748ad 100644
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -116,7 +116,7 @@ class MADWF
     ///////////////////////////////////////
     GridBase *src_grid = src.Grid();
 
-    assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
+    GRID_ASSERT( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
 
     if ( src_grid == Mato.GaugeGrid() ) {
       Mato.ImportPhysicalFermionSource(src,b);
@@ -204,7 +204,7 @@ class MADWF
     }
 
     std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
-    assert(0);
+    GRID_ASSERT(0);
 
   }
 
diff --git a/Grid/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h
index 1e948092..cb3c85fd 100644
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -61,7 +61,7 @@ public:
 
     //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
     Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
-    assert(zdata->n==this->Ls);
+    GRID_ASSERT(zdata->n==this->Ls);
 	
     // Call base setter
     this->SetCoefficientsTanh(zdata,b,c);
diff --git a/Grid/qcd/action/fermion/MobiusZolotarevFermion.h b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
index 48496773..53807fce 100644
--- a/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
@@ -61,7 +61,7 @@ public:
     RealD eps = lo/hi;
 
     Approx::zolotarev_data *zdata = Approx::zolotarev(eps,this->Ls,0);
-    assert(zdata->n==this->Ls);
+    GRID_ASSERT(zdata->n==this->Ls);
 
     std::cout<<GridLogMessage << "MobiusZolotarevFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Zolotarev range ["<<lo<<","<<hi<<"]"<<std::endl;
 	
diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
index 5b603017..8b01d818 100644
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@@ -60,7 +60,7 @@ public:
 				     FourDimGrid,
 				     FourDimRedBlackGrid,_mass,_M5,p)
   {
-    assert((this->Ls&0x1)==1); // Odd Ls required
+    GRID_ASSERT((this->Ls&0x1)==1); // Odd Ls required
     int nrational=this->Ls-1;// Even rational order
     Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
     this->SetCoefficientsTanh(zdata,scale);
diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
index 747cb508..7e723d99 100644
--- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@@ -59,7 +59,7 @@ public:
 				     FourDimGrid,
 				     FourDimRedBlackGrid,_mass,_M5,p)
   {
-    assert((this->Ls&0x1)==1); // Odd Ls required
+    GRID_ASSERT((this->Ls&0x1)==1); // Odd Ls required
 
     int nrational=this->Ls;// Odd rational order
     RealD eps = lo/hi;
diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
index 7210d6af..f3e3d934 100644
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@@ -60,7 +60,7 @@ public:
 				   FourDimGrid,
 				   FourDimRedBlackGrid,_mass,_M5,p)
   {
-    assert((this->Ls&0x1)==1); // Odd Ls required
+    GRID_ASSERT((this->Ls&0x1)==1); // Odd Ls required
     int nrational=this->Ls-1;// Even rational order
     Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);// eps is ignored for higham
     this->SetCoefficientsTanh(zdata,scale);
diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
index f0be4388..2558bb84 100644
--- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@@ -61,7 +61,7 @@ public:
 				   FourDimGrid,
 				   FourDimRedBlackGrid,_mass,_M5,p)
   {
-    assert((this->Ls&0x1)==1); // Odd Ls required
+    GRID_ASSERT((this->Ls&0x1)==1); // Odd Ls required
 
     int nrational=this->Ls;// Odd rational order
     RealD eps = lo/hi;
diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
index a71fc3f3..f2656a45 100644
--- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -104,8 +104,8 @@ public:
     FermionField in_buf(in.Grid()); in_buf = Zero();
     typedef typename Simd::scalar_type Scalar;
     Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
+    GRID_ASSERT(twist.size() == Nd);//check that twist is Nd
+    GRID_ASSERT(boundary.size() == Nd);//check that boundary conditions is Nd
     int shift = 0;
     for(unsigned int nu = 0; nu < Nd; nu++)
       {
@@ -137,7 +137,7 @@ public:
     FreePropagator(in,out,mass,boundary,twist);
   };
 
-  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
+  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; GRID_ASSERT(qmu.size()==Nd);};
   void addQmu(const FermionField &in, FermionField &out, int dag);
 
 protected:
diff --git a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
index 00ac222f..3fe2be27 100644
--- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
+++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
@@ -51,7 +51,7 @@ public:
   void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
     GridBase *grid=out.Grid();
     out.Checkerboard() = in.Checkerboard();
-    assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
+    GRID_ASSERT(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
     int Ls = grid->_rdimensions[0];
     thread_for(ss, grid->oSites(),
     {
diff --git a/Grid/qcd/action/fermion/StaggeredImpl.h b/Grid/qcd/action/fermion/StaggeredImpl.h
index f44d12f4..fee7613f 100644
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -168,7 +168,7 @@ public:
   }   
       
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-    assert (0); 
+    GRID_ASSERT (0); 
     // Must never hit
   }
 };
diff --git a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
index 18fe993c..b2e215e3 100644
--- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h
@@ -113,7 +113,7 @@ public:
       
   inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
   {
-    assert(0);
+    GRID_ASSERT(0);
   }
   inline void DoubleStore(GridBase *GaugeGrid,
 			  DoubledGaugeField &UUUds, // for Naik term
@@ -176,11 +176,11 @@ public:
   }
 
   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
-    assert(0);
+    GRID_ASSERT(0);
   }   
       
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-    assert (0); 
+    GRID_ASSERT (0); 
   }
 };
 typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec
diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
index c221d5f0..926d50b7 100644
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -740,9 +740,9 @@ public:
 
   template<class MaskField>
   static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
-    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
-    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
-    assert(!full.Grid()->_isCheckerBoarded);
+    GRID_ASSERT(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
+    GRID_ASSERT(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard()  == Odd);
+    GRID_ASSERT(!full.Grid()->_isCheckerBoarded);
 
     GridBase* grid = full.Grid();
     int t_dir = Nd-1;
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index 458f2c83..c3a5e4cf 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -241,7 +241,7 @@ public:
 
     this->_grid->StencilBarrier();
 
-    assert(source.Grid()==this->_grid);
+    GRID_ASSERT(source.Grid()==this->_grid);
     
     this->u_comm_offset=0;
       
@@ -278,7 +278,7 @@ public:
       vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
     }
     this->face_table_computed=1;
-    assert(this->u_comm_offset==this->_unified_buffer_size);
+    GRID_ASSERT(this->u_comm_offset==this->_unified_buffer_size);
     accelerator_barrier();
 #ifdef NVLINK_GET
     this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h
index 07248160..3643e581 100644
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -77,7 +77,7 @@ public:
   ImplParams Params;
 
   WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
-    assert(Params.boundary_phases.size() == Nd);
+    GRID_ASSERT(Params.boundary_phases.size() == Nd);
   };
 
   template<class _Spinor>
diff --git a/Grid/qcd/action/fermion/WilsonTMFermion5D.h b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
index 982e722a..e1f07e23 100644
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -136,8 +136,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
   // needed for fast PV
   void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) 
   {
-    assert(_mass.size() == _mu.size());
-    assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
+    GRID_ASSERT(_mass.size() == _mu.size());
+    GRID_ASSERT(_mass.size() == this->FermionGrid()->_fdimensions[0]);
     this->mass = _mass;
     this->mu = _mu;
   }
diff --git a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
index f1acb50c..308581c4 100644
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Ddense.h
@@ -59,7 +59,7 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
   
   chi.Checkerboard()=psi.Checkerboard();
   
-  assert(Ls==LLs);
+  GRID_ASSERT(Ls==LLs);
   
   Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
   Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
diff --git a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h
index 478fbb8b..75c53eb2 100644
--- a/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h
+++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h
@@ -78,8 +78,8 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
   Vector<iSinglet<Simd> > l(LLs);
   Vector<iSinglet<Simd> > d(LLs);
 
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(Ls/LLs==nsimd);
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   // just directly address via type pun
   typedef typename Simd::scalar_type scalar_type;
@@ -96,7 +96,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
       d_p[ss] = diag[s];
     }}
 
-  assert(Nc==3);
+  GRID_ASSERT(Nc==3);
 
   thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
 #if 0
@@ -221,8 +221,8 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
   Vector<iSinglet<Simd> > l(LLs);
   Vector<iSinglet<Simd> > d(LLs);
 
-  assert(Ls/LLs==nsimd);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(Ls/LLs==nsimd);
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   // just directly address via type pun
   typedef typename Simd::scalar_type scalar_type;
@@ -805,7 +805,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
     _Matp = &Matp;
     _Matm = &Matm;
   }
-  assert(_Matp->size()==Ls*LLs);
+  GRID_ASSERT(_Matp->size()==Ls*LLs);
 
   if ( switcheroo<Coeff_t>::iscomplex() ) {
     thread_loop( (auto site=0;site<vol;site++),{
diff --git a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
index 3ec4fbac..691152f9 100644
--- a/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/DomainWallEOFAFermiondense.h
@@ -60,7 +60,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, Fermion
 
   chi.Checkerboard() = psi.Checkerboard();
 
-  assert(Ls==LLs);
+  GRID_ASSERT(Ls==LLs);
 
   Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
   Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
diff --git a/Grid/qcd/action/fermion/deprecated/Lebesgue.cc b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc
index 480483ed..f36590db 100644
--- a/Grid/qcd/action/fermion/deprecated/Lebesgue.cc
+++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc
@@ -98,8 +98,8 @@ void LebesgueOrder::CartesianBlocking(void)
 
   IndexInteger ND = grid->_ndimension;
 
-  assert(ND==4);
-  assert(ND==Block.size());
+  GRID_ASSERT(ND==4);
+  GRID_ASSERT(ND==Block.size());
 
   Coordinate dims(ND);
   Coordinate xo(ND,0);
@@ -164,7 +164,7 @@ void LebesgueOrder::ZGraph(void)
   
   for(IndexInteger mu=0;mu<ND;mu++){
     dims[mu] = grid->_rdimensions[mu];
-    assert ( dims[mu] != 0 );
+    GRID_ASSERT ( dims[mu] != 0 );
     adims[mu] = alignup(dims[mu]);
   }
   
@@ -221,11 +221,11 @@ void LebesgueOrder::ZGraph(void)
 	+dims[0]*dims[1]*ax[2]
 	+dims[0]*dims[1]*dims[2]*ax[3];
 
-      assert(site < vol);
+      GRID_ASSERT(site < vol);
       _LebesgueReorder.push_back(site);
     }
   }
-  assert( _LebesgueReorder.size() == vol );
+  GRID_ASSERT( _LebesgueReorder.size() == vol );
 
   /*
     std::vector<int> coor(4);
diff --git a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
index 8091f344..456cecc1 100644
--- a/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
+++ b/Grid/qcd/action/fermion/deprecated/MobiusEOFAFermiondense.h
@@ -78,7 +78,7 @@ void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionFiel
 
   chi.Checkerboard() = psi.Checkerboard();
 
-  assert(Ls==LLs);
+  GRID_ASSERT(Ls==LLs);
 
   Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
   Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index 2ace6c18..60f6c223 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -285,7 +285,7 @@ void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, in
     std::vector<ComplexD> coeff(Nd);
     ComplexD ci(0,1);
 
-    assert(qmu.size()==Nd);
+    GRID_ASSERT(qmu.size()==Nd);
 
     for(int mu=0;mu<Nd;mu++){
        coeff[mu] = ci*qmu[mu];
@@ -451,7 +451,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   ///////////////////////////////////////////////////////////
   // The Cayley coeffs (unprec)
   ///////////////////////////////////////////////////////////
-  assert(gamma.size()==Ls);
+  GRID_ASSERT(gamma.size()==Ls);
 
   omega.resize(Ls);
   bs.resize(Ls);
@@ -489,7 +489,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   for(int i=0; i < Ls; i++){
     as[i] = 1.0;
     omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
-    assert(omega[i]!=Coeff_t(0.0));
+    GRID_ASSERT(omega[i]!=Coeff_t(0.0));
     bs[i] = 0.5*(bpc/omega[i] + bmc);
     cs[i] = 0.5*(bpc/omega[i] - bmc);
   }
@@ -504,7 +504,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   
   for(int i=0;i<Ls;i++){
     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    assert(bee[i]!=Coeff_t(0.0));
+    GRID_ASSERT(bee[i]!=Coeff_t(0.0));
     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
     beo[i]=as[i]*bs[i];
     ceo[i]=-as[i]*cs[i];
@@ -531,8 +531,8 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
     
     if ( i < Ls-1 ) {
 
-      assert(bee[i]!=Coeff_t(0.0));
-      assert(bee[0]!=Coeff_t(0.0));
+      GRID_ASSERT(bee[i]!=Coeff_t(0.0));
+      GRID_ASSERT(bee[0]!=Coeff_t(0.0));
       
       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
       
@@ -559,7 +559,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   { 
     Coeff_t delta_d=mass_minus*cee[Ls-1];
     for(int j=0;j<Ls-1;j++) {
-      assert(bee[j] != Coeff_t(0.0));
+      GRID_ASSERT(bee[j] != Coeff_t(0.0));
       delta_d *= cee[j]/bee[j];
     }
     dee[Ls-1] += delta_d;
@@ -639,7 +639,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      unsigned int mu)
 {
 
-  assert(mass_plus == mass_minus);
+  GRID_ASSERT(mass_plus == mass_minus);
   RealD mass = mass_plus;
   
   Gamma::Algebra Gmu [] = {
@@ -773,10 +773,10 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                 unsigned int tmax,
 						ComplexField &ph)// Complex phase factor
 {
-  assert(mu>=0);
-  assert(mu<Nd);
+  GRID_ASSERT(mu>=0);
+  GRID_ASSERT(mu<Nd);
 
-  assert(mass_plus == mass_minus);
+  GRID_ASSERT(mass_plus == mass_minus);
   RealD mass = mass_plus;
 
 #if 0
@@ -895,7 +895,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
     }
     else {
       std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
-      assert(b==1 && c==0);
+      GRID_ASSERT(b==1 && c==0);
     }
   }
 
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 5fbc7612..4bc703a3 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -53,7 +53,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
   autoView(psi , psi_i,AcceleratorRead);
   autoView(phi , phi_i,AcceleratorRead);
   autoView(chi , chi_i,AcceleratorWrite);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   int Ls =this->Ls;
 
@@ -95,7 +95,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
   autoView(psi , psi_i,AcceleratorRead);
   autoView(phi , phi_i,AcceleratorRead);
   autoView(chi , chi_i,AcceleratorWrite);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   int Ls=this->Ls;
 
@@ -208,7 +208,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
   auto pleem = & d_leem[0];
   auto pueem = & d_ueem[0];
 
-  assert(psi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(psi.Checkerboard() == psi.Checkerboard());
 
   uint64_t nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
index e65fb6d6..751efee2 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h
@@ -61,7 +61,7 @@ CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(
   , BoundaryMask(&FiveDimGrid)
   , BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
 {
-  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
+  GRID_ASSERT(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
 
   csw_r *= 0.5;
   csw_t *= 0.5;
@@ -188,7 +188,7 @@ void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionFie
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!fixedBoundaries); // TODO check for changes required for open bc
+  GRID_ASSERT(!fixedBoundaries); // TODO check for changes required for open bc
 
   // NOTE: code copied from original clover term
   conformable(X.Grid(), Y.Grid());
@@ -271,12 +271,12 @@ void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl, class CloverHelpers>
@@ -284,7 +284,7 @@ void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const Ferm
 								      FermionField&              out,
 								      const CloverDiagonalField& diagonal,
 								      const CloverTriangleField& triangle) {
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
   out.Checkerboard() = in.Checkerboard();
   conformable(in, out);
   CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index 7e3b7f00..a4f32850 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -59,7 +59,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
   , BoundaryMask(&Fgrid)
   , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
 {
-  assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
+  GRID_ASSERT(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
 
   csw_r *= 0.5;
   csw_t *= 0.5;
@@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!fixedBoundaries); // TODO check for changes required for open bc
+  GRID_ASSERT(!fixedBoundaries); // TODO check for changes required for open bc
 
   // NOTE: code copied from original clover term
   conformable(X.Grid(), Y.Grid());
@@ -269,12 +269,12 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force,
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl, class CloverHelpers>
@@ -282,7 +282,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const Fermio
                     FermionField&              out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle) {
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
   out.Checkerboard() = in.Checkerboard();
   conformable(in, out);
   conformable(in, diagonal);
diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
index 4bfbd31e..f0f8bb63 100644
--- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h
@@ -49,7 +49,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap
   std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
   int Ls = this->Ls;
   std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
-  assert(zdata->db==Ls);// Beta has Ls coeffs
+  GRID_ASSERT(zdata->db==Ls);// Beta has Ls coeffs
 
   R=(1+this->mass)/(1-this->mass);
 
@@ -311,7 +311,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
   mass(_mass)
 {
   int Ls = this->Ls;
-  assert((Ls&0x1)==1); // Odd Ls required
+  GRID_ASSERT((Ls&0x1)==1); // Odd Ls required
 }
 
     template<class Impl>
diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
index ae126bb5..589c6fdd 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -49,7 +49,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
   autoView( phi , phi_i, AcceleratorRead);
   autoView( psi , psi_i, AcceleratorRead);
   autoView( chi , chi_i, AcceleratorWrite);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
@@ -88,7 +88,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
   autoView( psi , psi_i, AcceleratorRead);
   autoView( phi , phi_i, AcceleratorRead);
   autoView( chi , chi_i, AcceleratorWrite);
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
   
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
@@ -190,7 +190,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
   auto pleem = & this->leem[0];
   auto pueem = & this->ueem[0];
 
-  assert(psi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(psi.Checkerboard() == psi.Checkerboard());
 
   auto nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
index 53b44ca2..8bb98d8b 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@@ -53,7 +53,7 @@ DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
 {
   RealD eps = 1.0;
   Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
-  assert(zdata->n == this->Ls);
+  GRID_ASSERT(zdata->n == this->Ls);
 
   std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
   this->SetCoefficientsTanh(zdata, 1.0, 0.0);
diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
index d2b4450e..fe05b627 100644
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h
@@ -64,32 +64,32 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
   _tmp(&FiveDimRedBlackGrid)
 {
 
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+  // some assertions
+  GRID_ASSERT(FiveDimGrid._ndimension==5);
+  GRID_ASSERT(FourDimGrid._ndimension==4);
+  GRID_ASSERT(FourDimRedBlackGrid._ndimension==4);
+  GRID_ASSERT(FiveDimRedBlackGrid._ndimension==5);
+  GRID_ASSERT(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
 
   // extent of fifth dim and not spread out
   Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
+  GRID_ASSERT(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  GRID_ASSERT(FiveDimGrid._processors[0]         ==1);
+  GRID_ASSERT(FiveDimRedBlackGrid._processors[0] ==1);
 
   // Other dimensions must match the decomposition of the four-D fields 
   for(int d=0;d<4;d++){
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
 
-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
 
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
   }
 
   if (Impl::LsVectorised) { 
@@ -97,20 +97,20 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
     int nsimd = Simd::Nsimd();
     
     // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
-    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+    GRID_ASSERT(FiveDimGrid._simd_layout[0]        ==nsimd);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 
     for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]==1);
-      assert(FourDimRedBlackGrid._simd_layout[d]==1);
-      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+      GRID_ASSERT(FourDimGrid._simd_layout[d]==1);
+      GRID_ASSERT(FourDimRedBlackGrid._simd_layout[d]==1);
+      GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[d+1]==1);
     }
 
   } else {
     
     // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-    assert(FiveDimGrid._simd_layout[0]        ==1);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[0]==1);
+    GRID_ASSERT(FiveDimGrid._simd_layout[0]        ==1);
 
   }
   int LLs = FiveDimGrid._rdimensions[0];
@@ -242,7 +242,7 @@ void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
 						     int dag)
 {
   // No force terms in multi-rhs solver staggered
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl>
@@ -251,7 +251,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
 						 const FermionField &B,
 						 int dag)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl>
@@ -260,7 +260,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 						   const FermionField &B,
 						   int dag)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 
@@ -270,7 +270,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 						   const FermionField &B,
 						   int dag)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 /*CHANGE */
@@ -290,7 +290,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
 								   DoubledGaugeField & U,DoubledGaugeField & UUU,
 								   const FermionField &in, FermionField &out,int dag)
 {
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  //  GRID_ASSERT((dag==DaggerNo) ||(dag==DaggerYes));
   Compressor compressor; 
 
   int LLs = in.Grid()->_rdimensions[0];
@@ -352,7 +352,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check
 
-  assert(in.Checkerboard()==Even);
+  GRID_ASSERT(in.Checkerboard()==Even);
   out.Checkerboard() = Odd;
 
   DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
@@ -363,7 +363,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check
 
-  assert(in.Checkerboard()==Odd);
+  GRID_ASSERT(in.Checkerboard()==Odd);
   out.Checkerboard() = Even;
 
   DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
@@ -390,7 +390,7 @@ void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) 
@@ -467,7 +467,7 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
 								Current curr_type,
 								unsigned int mu)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template <class Impl>
@@ -480,7 +480,7 @@ void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in
 							   unsigned int tmax,
 							   ComplexField &lattice_cmplx)
 {
-  assert(0);
+  GRID_ASSERT(0);
 
 }
   
diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
index bd9dd132..a1eebf2f 100644
--- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
@@ -241,7 +241,7 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
 						   GaugeField & mat,
 						   const FermionField &A, const FermionField &B, int dag) 
 {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
 
@@ -284,7 +284,7 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
     // mat+= outer ( AU, UUB) <-- and then use covariant cshift?
     // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
 
-    assert(0);// need to figure out the force interface with a blasted three link term.
+    GRID_ASSERT(0);// need to figure out the force interface with a blasted three link term.
     
   }
 }
@@ -308,8 +308,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
 
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
+  GRID_ASSERT(V.Checkerboard() == Even);
+  GRID_ASSERT(U.Checkerboard() == Odd);
   mat.Checkerboard() = Odd;
 
   DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
@@ -322,8 +322,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
 
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
+  GRID_ASSERT(V.Checkerboard() == Odd);
+  GRID_ASSERT(U.Checkerboard() == Even);
   mat.Checkerboard() = Even;
 
   DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
@@ -346,7 +346,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;
 
   DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
@@ -358,7 +358,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Odd);
+  GRID_ASSERT(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;
 
   DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
@@ -372,7 +372,7 @@ void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  assert(0); // Not implemented yet
+  GRID_ASSERT(0); // Not implemented yet
 }
 
 template <class Impl>
@@ -450,7 +450,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
 							     const FermionField &in,
 							     FermionField &out, int dag) 
 {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
   st.HaloExchange(in, compressor);
@@ -473,7 +473,7 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
 							      Current curr_type,
 							      unsigned int mu)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template <class Impl>
@@ -486,7 +486,7 @@ void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                               unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-  assert(0);
+  GRID_ASSERT(0);
 
 }
 
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
index b9165edb..105fc4c7 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -48,7 +48,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
   autoView(phi , phi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);
 
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
@@ -91,7 +91,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
   auto pm  = this->pm;
   int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
   
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
@@ -137,7 +137,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
   autoView(phi , phi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);
 
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
   
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
@@ -178,7 +178,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
   autoView(phi , phi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);
 
-  assert(phi.Checkerboard() == psi.Checkerboard());
+  GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
   auto pdiag  = &this->d_diag[0];
   auto pupper = &this->d_upper[0];
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
index 70f06dfc..05d94a0a 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h
@@ -55,7 +55,7 @@ MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
 
   RealD eps = 1.0;
   Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
-  assert(zdata->n == this->Ls);
+  GRID_ASSERT(zdata->n == this->Ls);
 
   std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
     ",c=" << _c << ") with Ls=" << Ls << std::endl;
diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
index b596dc44..4fd8d09e 100644
--- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h
@@ -191,7 +191,7 @@ void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeFie
 						GaugeField & mat,
 						const FermionField &A, const FermionField &B, int dag) 
 {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
 
@@ -213,7 +213,7 @@ void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeFie
       Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
     });
 
-    assert(0);// need to figure out the force interface with a blasted three link term.
+    GRID_ASSERT(0);// need to figure out the force interface with a blasted three link term.
     
   }
 }
@@ -237,8 +237,8 @@ void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionFiel
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
 
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
+  GRID_ASSERT(V.Checkerboard() == Even);
+  GRID_ASSERT(U.Checkerboard() == Odd);
   mat.Checkerboard() = Odd;
 
   DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
@@ -251,8 +251,8 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
   conformable(U.Grid(), V.Grid());
   conformable(U.Grid(), mat.Grid());
 
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
+  GRID_ASSERT(V.Checkerboard() == Odd);
+  GRID_ASSERT(U.Checkerboard() == Even);
   mat.Checkerboard() = Even;
 
   DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
@@ -275,7 +275,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;
 
   DhopInternal(StencilEven, UmuOdd, in, out, dag);
@@ -287,7 +287,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Odd);
+  GRID_ASSERT(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;
 
   DhopInternal(StencilOdd, UmuEven, in, out, dag);
@@ -301,7 +301,7 @@ void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out) 
 {
-  assert(0); // Not implemented yet
+  GRID_ASSERT(0); // Not implemented yet
 }
 
 template <class Impl>
@@ -316,7 +316,7 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
   //  thread_for( sss, in.Grid()->oSites(),{
   //    Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
   //  });
-  assert(0);
+  GRID_ASSERT(0);
 };
 
 
@@ -375,7 +375,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
 							  const FermionField &in,
 							  FermionField &out, int dag) 
 {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor;
   st.HaloExchange(in, compressor);
@@ -398,7 +398,7 @@ void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in
 							      Current curr_type,
 							      unsigned int mu)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template <class Impl>
@@ -411,7 +411,7 @@ void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                               unsigned int tmax,
 					      ComplexField &lattice_cmplx)
 {
-  assert(0);
+  GRID_ASSERT(0);
 
 }
 
diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
index 84884c6d..6115bf9b 100644
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -245,7 +245,7 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
   if ( qmu.size() ) {
 
     std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
-    assert(qmu.size()==Nd);
+    GRID_ASSERT(qmu.size()==Nd);
 
     FermionField qslash_psi(psi.Grid());
 
@@ -446,7 +446,7 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
   //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
   int Ls = this->Ls;
 
-  assert(Ls == (2*zdata->da -1) );
+  GRID_ASSERT(Ls == (2*zdata->da -1) );
 
   // Part frac
   //      RealD R;
@@ -509,7 +509,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
 {
   int Ls = this->Ls;
   qmu.resize(0);
-  assert((Ls&0x1)==1); // Odd Ls required
+  GRID_ASSERT((Ls&0x1)==1); // Odd Ls required
   int nrational=Ls-1;
 
 
diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
index e9cacbcf..4857c25a 100644
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -624,7 +624,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
 					 SiteSpinor *buf, int sF,
 					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
-  assert(0);
+  GRID_ASSERT(0);
 };
 
 
@@ -733,7 +733,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView
     }
    }
 #else 
-    assert(0);
+    GRID_ASSERT(0);
 #endif
    
 }
@@ -787,7 +787,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
     }
   }
 #else 
-  assert(0);
+  GRID_ASSERT(0);
 #endif
 }
    
@@ -889,7 +889,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
     }
   }
 #else 
-  assert(0);
+  GRID_ASSERT(0);
 #endif
 }
 
@@ -958,7 +958,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
     }
   }
 #else 
-  assert(0);
+  GRID_ASSERT(0);
 #endif
 }
 
diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
index 05dbf3b2..7a981eb5 100644
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -234,7 +234,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
   // What about "dag" ?
   // Because we work out pU . dS/dU 
   // U
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 #define KERNEL_CALLNB(A,improved)					\
@@ -291,7 +291,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
   }
-  assert(0 && " Kernel optimisation case not covered ");
+  GRID_ASSERT(0 && " Kernel optimisation case not covered ");
 }
 template <class Impl> 
 void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, 
diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
index 2a7e7535..a4cda1b1 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -54,7 +54,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
   , CloverTermDagOdd(&Hgrid)
   , CloverTermInvDagEven(&Hgrid)
   , CloverTermInvDagOdd(&Hgrid) {
-  assert(Nd == 4); // require 4 dimensions
+  GRID_ASSERT(Nd == 4); // require 4 dimensions
 
   if(clover_anisotropy.isAnisotropic) {
     csw_r     = _csw_r * 0.5 / clover_anisotropy.xi_0;
@@ -189,7 +189,7 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
 {
   out.Checkerboard() = in.Checkerboard();
   CloverField *Clover;
-  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
 
   if (dag)
   {
@@ -323,14 +323,14 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 // Derivative parts
 template<class Impl, class CloverHelpers>
 void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
-  assert(0); // not implemented yet
+  GRID_ASSERT(0); // not implemented yet
 }
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 9598552f..48ec71bf 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -62,33 +62,33 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
-  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
-  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+  // some assertions
+  GRID_ASSERT(FiveDimGrid._ndimension==5);
+  GRID_ASSERT(FourDimGrid._ndimension==4);
+  GRID_ASSERT(FourDimRedBlackGrid._ndimension==4);
+  GRID_ASSERT(FiveDimRedBlackGrid._ndimension==5);
+  GRID_ASSERT(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
 
   // extent of fifth dim and not spread out
   Ls=FiveDimGrid._fdimensions[0];
-  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FiveDimGrid._processors[0]         ==1);
-  assert(FiveDimRedBlackGrid._processors[0] ==1);
+  GRID_ASSERT(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  GRID_ASSERT(FiveDimGrid._processors[0]         ==1);
+  GRID_ASSERT(FiveDimRedBlackGrid._processors[0] ==1);
 
   // Other dimensions must match the decomposition of the four-D fields 
   for(int d=0;d<4;d++){
 
-    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
-    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
 
-    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
-    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
 
-    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
-    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
-    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
+    GRID_ASSERT(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
   }
 
   if ( p.dirichlet.size() == Nd+1) {
@@ -109,20 +109,20 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     int nsimd = Simd::Nsimd();
     
     // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
-    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+    GRID_ASSERT(FiveDimGrid._simd_layout[0]        ==nsimd);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 
     for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]==1);
-      assert(FourDimRedBlackGrid._simd_layout[d]==1);
-      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+      GRID_ASSERT(FourDimGrid._simd_layout[d]==1);
+      GRID_ASSERT(FourDimRedBlackGrid._simd_layout[d]==1);
+      GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[d+1]==1);
     }
 
   } else {
     
     // Dimension zero of the five-d is the Ls direction
-    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-    assert(FiveDimGrid._simd_layout[0]        ==1);
+    GRID_ASSERT(FiveDimRedBlackGrid._simd_layout[0]==1);
+    GRID_ASSERT(FiveDimGrid._simd_layout[0]        ==1);
 
   }
     
@@ -157,7 +157,7 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
     for(int d=0;d<Nd;d++) {
       int GaugeBlock = Block[d+1];
       int ldim=GaugeGrid()->LocalDimensions()[d];
-      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
+      if (GaugeBlock) GRID_ASSERT( (GaugeBlock%ldim)==0);
     }
 
     if (!this->Params.partialDirichlet) {
@@ -179,8 +179,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
 {
   int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
                     // we drop off the innermost fifth dimension
-  //  assert( (disp==1)||(disp==-1) );
-  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
+  //  GRID_ASSERT( (disp==1)||(disp==-1) );
+  //  GRID_ASSERT( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 
   int skip = (disp==1) ? 0 : 1;
   int dirdisp = dir+skip*4;
@@ -211,7 +211,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  const FermionField &B,
 					  int dag)
 {
-  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  GRID_ASSERT((dag==DaggerNo) ||(dag==DaggerYes));
 
   conformable(st.Grid(),A.Grid());
   conformable(st.Grid(),B.Grid());
@@ -275,8 +275,8 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
   conformable(A.Grid(),FermionRedBlackGrid());
   conformable(A.Grid(),B.Grid());
 
-  assert(B.Checkerboard()==Odd);
-  assert(A.Checkerboard()==Even);
+  GRID_ASSERT(B.Checkerboard()==Odd);
+  GRID_ASSERT(A.Checkerboard()==Even);
   mat.Checkerboard() = Even;
 
   DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
@@ -292,8 +292,8 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
   conformable(A.Grid(),FermionRedBlackGrid());
   conformable(A.Grid(),B.Grid());
 
-  assert(B.Checkerboard()==Even);
-  assert(A.Checkerboard()==Odd);
+  GRID_ASSERT(B.Checkerboard()==Even);
+  GRID_ASSERT(A.Checkerboard()==Odd);
   mat.Checkerboard() = Odd;
 
   DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
@@ -429,7 +429,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check
 
-  assert(in.Checkerboard()==Even);
+  GRID_ASSERT(in.Checkerboard()==Even);
   out.Checkerboard() = Odd;
 
   DhopInternal(StencilEven,UmuOdd,in,out,dag);
@@ -440,7 +440,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check
 
-  assert(in.Checkerboard()==Odd);
+  GRID_ASSERT(in.Checkerboard()==Odd);
   out.Checkerboard() = Even;
 
   DhopInternal(StencilOdd,UmuEven,in,out,dag);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
index 8c58f692..cba9b48c 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -233,7 +233,7 @@ template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                         GaugeField &mat, const FermionField &A,
                                         const FermionField &B, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor(dag);
 
@@ -280,8 +280,8 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
   //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
   // Motivation: look at the SchurDiff operator
 
-  assert(V.Checkerboard() == Even);
-  assert(U.Checkerboard() == Odd);
+  GRID_ASSERT(V.Checkerboard() == Even);
+  GRID_ASSERT(U.Checkerboard() == Odd);
   mat.Checkerboard() = Odd;
 
   DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
@@ -294,8 +294,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
   conformable(U.Grid(), V.Grid());
   //conformable(U.Grid(), mat.Grid());
 
-  assert(V.Checkerboard() == Odd);
-  assert(U.Checkerboard() == Even);
+  GRID_ASSERT(V.Checkerboard() == Odd);
+  GRID_ASSERT(U.Checkerboard() == Even);
   mat.Checkerboard() = Even;
 
   DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
@@ -318,7 +318,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Even);
+  GRID_ASSERT(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;
 
   DhopInternal(StencilEven, UmuOdd, in, out, dag);
@@ -330,7 +330,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
-  assert(in.Checkerboard() == Odd);
+  GRID_ASSERT(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;
 
   DhopInternal(StencilOdd, UmuEven, in, out, dag);
@@ -365,7 +365,7 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
   Compressor compressor(DaggerNo);
   Stencil.HaloExchange(in, compressor);
 
-  assert((out.size()==8)||(out.size()==9));
+  GRID_ASSERT((out.size()==8)||(out.size()==9));
   for(int dir=0;dir<Nd;dir++){
     for(int disp=-1;disp<=1;disp+=2){
 
@@ -406,7 +406,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
 						      FermionField &out, int dag)
 {
   GRID_TRACE("DhopOverlapped");
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
 
   Compressor compressor(dag);
   int len =  U.Grid()->oSites();
@@ -475,7 +475,7 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
 					     FermionField &out, int dag)
 {
   GRID_TRACE("DhopSerial");
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+  GRID_ASSERT((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
   {
     GRID_TRACE("HaloExchange");
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
index 0c956f7e..ab1eba90 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
@@ -44,42 +44,42 @@ template<class Impl> void
 WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 				  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl> void 
 WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl> void 
 WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl> void 
 WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl> void 
 WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 template<class Impl> void 
 WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 54a76b07..4a230dfb 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -375,8 +375,8 @@ template <class Impl>
 void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
 {
-  assert(dirdisp<=7);
-  assert(dirdisp>=0);
+  GRID_ASSERT(dirdisp<=7);
+  GRID_ASSERT(dirdisp>=0);
 
    autoView(U_v  ,U  ,AcceleratorRead);
    autoView(in_v ,in ,AcceleratorRead);
@@ -405,13 +405,13 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    LoopBody(Zm);
    LoopBody(Tm);
    default:
-     assert(0);
+     GRID_ASSERT(0);
      break;
    }
 #undef LoopBody
 }
 
-#ifdef GRID_SYCL
+#if 0
 extern "C" {
     ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
@@ -525,7 +525,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
    }
-   assert(0 && " Kernel optimisation case not covered ");
+   GRID_ASSERT(0 && " Kernel optimisation case not covered ");
   }
 
 template <class Impl>
@@ -571,7 +571,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
    }
-   assert(0 && " Kernel optimisation case not covered ");
+   GRID_ASSERT(0 && " Kernel optimisation case not covered ");
   }
 
 #undef KERNEL_CALLNB
diff --git a/Grid/qcd/action/gauge/Gauge.h b/Grid/qcd/action/gauge/Gauge.h
index 7a72d087..2b6b0e6b 100644
--- a/Grid/qcd/action/gauge/Gauge.h
+++ b/Grid/qcd/action/gauge/Gauge.h
@@ -51,6 +51,9 @@ typedef IwasakiGaugeAction<PeriodicGimplD>         IwasakiGaugeActionD;
 typedef SymanzikGaugeAction<PeriodicGimplR>        SymanzikGaugeActionR;
 typedef SymanzikGaugeAction<PeriodicGimplF>        SymanzikGaugeActionF;
 typedef SymanzikGaugeAction<PeriodicGimplD>        SymanzikGaugeActionD;
+typedef DBW2GaugeAction<PeriodicGimplR>            DBW2GaugeActionR;
+typedef DBW2GaugeAction<PeriodicGimplF>            DBW2GaugeActionF;
+typedef DBW2GaugeAction<PeriodicGimplD>            DBW2GaugeActionD;
 
 
 typedef WilsonGaugeAction<ConjugateGimplR>          ConjugateWilsonGaugeActionR;
diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h
index a9af1fae..95b69290 100644
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -138,10 +138,13 @@ public:
     //auto start = std::chrono::high_resolution_clock::now();
     autoView(U_v,U,AcceleratorWrite);
     autoView(P_v,P,AcceleratorRead);
-    accelerator_for(ss, P.Grid()->oSites(),1,{
+    typedef typename Field::vector_object vobj;
+    const int Nsimd = vobj::Nsimd();
+    accelerator_for(ss, P.Grid()->oSites(),Nsimd,{
       for (int mu = 0; mu < Nd; mu++) {
-          U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu);
-          U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu));
+	auto tmp = Exponentiate(P_v(ss)(mu), ep, Nexp) * U_v(ss)(mu);
+	tmp      = Group::ProjectOnGeneralGroup(tmp);
+	coalescedWrite(U_v[ss](mu),tmp);
       }
     });
    //auto end = std::chrono::high_resolution_clock::now();
@@ -176,6 +179,8 @@ public:
     Group::ColdConfiguration(pRNG, U);
   }
 
+  static const int num_colours = Group::Dimension;
+
 };
 
 
diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
index 312e889c..62141c83 100644
--- a/Grid/qcd/action/gauge/GaugeImplementations.h
+++ b/Grid/qcd/action/gauge/GaugeImplementations.h
@@ -97,7 +97,7 @@ public:
   static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
                                             const Lattice<covariant> &field)
   {
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::CovShiftForward(Link, mu, field);
     else
@@ -108,7 +108,7 @@ public:
   static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
                                              const Lattice<covariant> &field)
   {
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::CovShiftBackward(Link, mu, field);
     else 
@@ -123,7 +123,7 @@ public:
   static inline GaugeLinkField
   CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
   {
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::CovShiftIdentityBackward(Link, mu);
     else 
@@ -132,7 +132,7 @@ public:
   static inline GaugeLinkField
   CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
   {
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::CovShiftIdentityForward(Link,mu);
     else
@@ -148,7 +148,7 @@ public:
   //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
   static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
   {
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::ShiftStaple(Link,mu);
     else     
@@ -169,7 +169,7 @@ public:
   //shift = -1
   //Out(x) = U_\mu(x-\hat\mu mod L)
   static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
-    assert(_conjDirs.size() == Nd);
+    GRID_ASSERT(_conjDirs.size() == Nd);
     if(_conjDirs[mu]) 
       return ConjugateBC::CshiftLink(Link,mu,shift);
     else     
diff --git a/Grid/qcd/action/gauge/Photon.h b/Grid/qcd/action/gauge/Photon.h
index 3d4baccd..34107fcd 100644
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -193,7 +193,7 @@ NAMESPACE_BEGIN(Grid);
         break;
       }
       default:
-        assert(0);
+        GRID_ASSERT(0);
         break;
     }
   }
@@ -246,10 +246,10 @@ NAMESPACE_BEGIN(Grid);
         transverseProjectSpatial(out);
         break;
       case Gauge::landau:
-        assert(0);
+        GRID_ASSERT(0);
         break;
       default:
-        assert(0);
+        GRID_ASSERT(0);
         break;
     }
   }
diff --git a/Grid/qcd/action/pseudofermion/Bounds.h b/Grid/qcd/action/pseudofermion/Bounds.h
index 8864b1d7..fc22fa57 100644
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -11,7 +11,7 @@ NAMESPACE_BEGIN(Grid);
       PowerMethod<Field> power_method;
       auto lambda_max = power_method(HermOp,Phi);
       std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
-      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
+      GRID_ASSERT( (lambda_max < hi) && " High Bounds Check on operator failed" );
     }
 
      template<class Field> void ChebyBoundsCheck(LinearOperatorBase<Field> &HermOp,
@@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << " Cheb x noise             = "<<Nz<<std::endl;
       std::cout << " Ratio                    = "<<Nz/Nx<<std::endl;
       std::cout << "************************* "<<std::endl;
-      assert( ((Nz/Nx)<1.0) && " ChebyBoundsCheck ");
+      GRID_ASSERT( ((Nz/Nx)<1.0) && " ChebyBoundsCheck ");
     }
       
     template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
@@ -71,7 +71,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise |^2  = "<<Nd<<std::endl;
       std::cout << " | noise - MdagM (MdagM^-1/2)^2  noise|/|noise| = " << std::sqrt(Nd/Nx) << std::endl;
       std::cout << "************************* "<<std::endl;
-      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
+      GRID_ASSERT( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
     }
 
     /* For a HermOp = M^dag M, check the approximation of  HermOp^{-1/inv_pow}
@@ -122,7 +122,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |^2  = "<<Nd<<std::endl;
       std::cout << " | noise - MdagM (MdagM^-1/" << inv_pow << ")^" << inv_pow << " noise |/| noise |  = "<<std::sqrt(Nd/Nx)<<std::endl;
       std::cout << "************************* "<<std::endl;
-      assert( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
+      GRID_ASSERT( (std::sqrt(Nd/Nx)<tol) && " InversePowerBoundsCheck ");
     }
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h b/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
index 0d74a560..29408ea2 100644
--- a/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -61,8 +61,8 @@ public:
     conformable(fcbgrid,V.Grid());
 
     // Assert the checkerboard?? or code for either
-    assert(U.Checkerboard()==Odd);
-    assert(V.Checkerboard()==U.Checkerboard());
+    GRID_ASSERT(U.Checkerboard()==Odd);
+    GRID_ASSERT(V.Checkerboard()==U.Checkerboard());
 
     // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
     // it is not conformable with the HMC force field
@@ -83,8 +83,8 @@ public:
     this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even 
     this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
           
-    assert(ForceE.Checkerboard()==Even);
-    assert(ForceO.Checkerboard()==Odd);
+    GRID_ASSERT(ForceE.Checkerboard()==Even);
+    GRID_ASSERT(ForceO.Checkerboard()==Odd);
 
     setCheckerboard(Force,ForceE); 
     setCheckerboard(Force,ForceO);
@@ -106,8 +106,8 @@ public:
     conformable(fcbgrid,V.Grid());
 
     // Assert the checkerboard?? or code for either
-    assert(V.Checkerboard()==Odd);
-    assert(V.Checkerboard()==V.Checkerboard());
+    GRID_ASSERT(V.Checkerboard()==Odd);
+    GRID_ASSERT(V.Checkerboard()==V.Checkerboard());
 
     // NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
     // it is not conformable with the HMC force field
@@ -127,8 +127,8 @@ public:
     this->_Mat.MooeeInv(tmp1,tmp2); // even->even 
     this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
 
-    assert(ForceE.Checkerboard()==Even);
-    assert(ForceO.Checkerboard()==Odd);
+    GRID_ASSERT(ForceE.Checkerboard()==Even);
+    GRID_ASSERT(ForceO.Checkerboard()==Odd);
 
     setCheckerboard(Force,ForceE); 
     setCheckerboard(Force,ForceO);
diff --git a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
index 9d4df1d3..5c7e5a0d 100644
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -400,7 +400,7 @@ NAMESPACE_BEGIN(Grid);
 	  std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl;
 	  std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << "  expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl;
 
-	  assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
+	  GRID_ASSERT( ( test < param.BoundsCheckTol ) && " Initial action check failed" );
 	  initial_action = false;
 	}
 
diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
index 656e9b2f..ee0110db 100644
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -135,7 +135,7 @@ public:
     // FIXME : Clover term not yet..
     //////////////////////////////////////////////////////
 
-    assert(FermOp.ConstEE() == 1);
+    GRID_ASSERT(FermOp.ConstEE() == 1);
     PhiEven = Zero();
   };
 
diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
index 2e5208a8..78e7fff4 100644
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -168,7 +168,7 @@ public:
     //	FermOp.MeeDeriv(tmp , Y, X,DaggerNo );    dSdU=tmp;
     //  FermOp.MeeDeriv(tmp , X, Y,DaggerYes);  dSdU=dSdU+tmp;
 
-    assert(FermOp.ConstEE() == 1);
+    GRID_ASSERT(FermOp.ConstEE() == 1);
 
     /*
       FermOp.MooeeInvDag(PhiOdd,Y);
diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
index ff9a6496..6c9cba6e 100644
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -231,8 +231,8 @@ NAMESPACE_BEGIN(Grid);
 
         // FIXME No force contribution from EvenEven assumed here
         // Needs a fix for clover.
-        assert(NumOp.ConstEE() == 1);
-        assert(DenOp.ConstEE() == 1);
+        GRID_ASSERT(NumOp.ConstEE() == 1);
+        GRID_ASSERT(DenOp.ConstEE() == 1);
 
         dSdU = -dSdU;
         
diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h
index 7708a489..e88f1bcd 100644
--- a/Grid/qcd/action/scalar/ScalarInteractionAction.h
+++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h
@@ -81,7 +81,7 @@ public:
 
   virtual RealD S(const Field &p)
   {
-    assert(p.Grid()->Nd() == Ndim);
+    GRID_ASSERT(p.Grid()->Nd() == Ndim);
     static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
     phiStencil.HaloExchange(p, compressor);
     Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid());
@@ -128,7 +128,7 @@ public:
   virtual void deriv(const Field &p, Field &force)
   {
     double t0 = usecond();
-    assert(p.Grid()->Nd() == Ndim);
+    GRID_ASSERT(p.Grid()->Nd() == Ndim);
     force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
     double interm_t = usecond();
 
diff --git a/Grid/qcd/gparity/GparityFlavour.h b/Grid/qcd/gparity/GparityFlavour.h
index b2009235..1b4f38db 100644
--- a/Grid/qcd/gparity/GparityFlavour.h
+++ b/Grid/qcd/gparity/GparityFlavour.h
@@ -388,7 +388,7 @@ accelerator_inline auto operator*(const GparityFlavour &G, const iVector<vtype,
     multFlavourProjMinus(ret, arg); break;
   case GparityFlavour::Algebra::MinusProjMinus:
     multFlavourMinusProjMinus(ret, arg); break;
-  default: assert(0);
+  default: GRID_ASSERT(0);
   }
  
   return ret;
@@ -426,7 +426,7 @@ accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix<vtype,
     lmultFlavourProjMinus(ret, arg); break;
   case GparityFlavour::Algebra::MinusProjMinus:
     lmultFlavourMinusProjMinus(ret, arg); break;  
-  default: assert(0);
+  default: GRID_ASSERT(0);
   }
   
   return ret;
@@ -464,7 +464,7 @@ accelerator_inline auto operator*(const iMatrix<vtype, Ngp> &arg, const GparityF
     rmultFlavourProjMinus(ret, arg); break;
   case GparityFlavour::Algebra::MinusProjMinus:
     rmultFlavourMinusProjMinus(ret, arg); break;
-  default: assert(0);
+  default: GRID_ASSERT(0);
   }
 
   return ret;
diff --git a/Grid/qcd/hmc/HMCResourceManager.h b/Grid/qcd/hmc/HMCResourceManager.h
index 19bee923..f34fb98e 100644
--- a/Grid/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -216,7 +216,7 @@ public:
   }
 
   void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
-    assert(have_Filter==false);
+    GRID_ASSERT(have_Filter==false);
     Filter = std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> >(MomFilter);
     have_Filter = true;
   }
@@ -250,7 +250,7 @@ public:
   void AddRNGs(std::string s = "") {
     // Couple the RNGs to the GridModule tagged by s
     // the default is the first grid registered
-    assert(Grids.size() > 0 && !have_RNG);
+    GRID_ASSERT(Grids.size() > 0 && !have_RNG);
     if (s.empty()) s = Grids.begin()->first;
     std::cout << GridLogDebug << "Adding RNG to grid: " << s << std::endl;
     RNGs.set_pRNG(new GridParallelRNG(GetCartesian(s)));
@@ -262,12 +262,12 @@ public:
   GridSerialRNG& GetSerialRNG() { return RNGs.get_sRNG(); }
 
   GridParallelRNG& GetParallelRNG() {
-    assert(have_RNG);
+    GRID_ASSERT(have_RNG);
     return RNGs.get_pRNG();
   }
 
   void SeedFixedIntegers() {
-    assert(have_RNG);
+    GRID_ASSERT(have_RNG);
     RNGs.seed();
   }
 
diff --git a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
index 72f5e39b..4169ce38 100644
--- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -98,7 +98,7 @@ public:
   virtual void TrajectoryComplete(int traj,
                                   typename Impl::Field &U,
                                   GridSerialRNG &sRNG,
-                                  GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared
+                                  GridParallelRNG &pRNG) { GRID_ASSERT(0); } ; // HMC should pass the smart config with smeared and unsmeared
   
   virtual void CheckpointRestore(int traj, typename Impl::Field &U,
                                  GridSerialRNG &sRNG,
diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
index 549920a0..f497f749 100644
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -126,7 +126,7 @@ public:
     // input U actually not used in the fundamental case
     // Fundamental updates, include smearing
 
-    assert(as.size()==LevelForces.size());
+    GRID_ASSERT(as.size()==LevelForces.size());
     
     Field level_force(U.Grid()); level_force =Zero();
     for (int a = 0; a < as[level].actions.size(); ++a) {
@@ -267,13 +267,13 @@ public:
 
   void reset_timer(void)
   {
-    assert(as.size()==LevelForces.size());
+    GRID_ASSERT(as.size()==LevelForces.size());
     for (int level = 0; level < as.size(); ++level) {
       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
         as[level].actions.at(actionID)->reset_timer();
       }
       int actionID=0;
-      assert(LevelForces.at(level).actions.size()==1);
+      GRID_ASSERT(LevelForces.at(level).actions.size()==1);
       LevelForces.at(level).actions.at(actionID)->reset_timer();
     }
   }
@@ -398,7 +398,7 @@ public:
   // Initialization of momenta and actions
   void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
   {
-    assert(P.Grid() == U.Grid());
+    GRID_ASSERT(P.Grid() == U.Grid());
     std::cout << GridLogIntegrator << "Integrator refresh" << std::endl;
 
     std::cout << GridLogIntegrator << "Generating momentum" << std::endl;
@@ -455,7 +455,7 @@ public:
   RealD S(Field& U) 
   {  // here also U not used
 
-    assert(as.size()==LevelForces.size());
+    GRID_ASSERT(as.size()==LevelForces.size());
     std::cout << GridLogIntegrator << "Integrator action\n";
 
     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
@@ -545,14 +545,14 @@ public:
 
     // Check the clocks all match on all levels
     for (int level = 0; level < as.size(); ++level) {
-      assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same
+      GRID_ASSERT(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same
       std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
     }
 
     FieldImplementation::Project(U);
 
     // and that we indeed got to the end of the trajectory
-    assert(fabs(t_U - Params.trajL) < 1.0e-6);
+    GRID_ASSERT(fabs(t_U - Params.trajL) < 1.0e-6);
 
   }
 
diff --git a/Grid/qcd/modules/ObservableModules.h b/Grid/qcd/modules/ObservableModules.h
index 87fcbb92..cda86185 100644
--- a/Grid/qcd/modules/ObservableModules.h
+++ b/Grid/qcd/modules/ObservableModules.h
@@ -103,6 +103,18 @@ class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
   PolyakovMod(): ObsBase(NoParameters()){}
 };
 
+template < class Impl >
+class SpatialPolyakovMod: public ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters>{
+  typedef ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters> ObsBase;
+  using ObsBase::ObsBase; // inherit the ObsBase constructors
+  // Module wrapper that exposes SpatialPolyakovLogger<Impl> as a parameterless ObservableModule.
+  // Create the SpatialPolyakovLogger and hand it to ObservablePtr via reset()
+  virtual void initialize(){
+    this->ObservablePtr.reset(new SpatialPolyakovLogger<Impl>());
+  }
+  public:
+  SpatialPolyakovMod(): ObsBase(NoParameters()){} // default: construct the base from an empty NoParameters
+};
 
 template < class Impl >
 class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
diff --git a/Grid/qcd/observables/polyakov_loop.h b/Grid/qcd/observables/polyakov_loop.h
index 0b59f549..57812ff6 100644
--- a/Grid/qcd/observables/polyakov_loop.h
+++ b/Grid/qcd/observables/polyakov_loop.h
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file: ./lib/qcd/modules/polyakov_line.h
+Source file: ./Grid/qcd/observables/polyakov_loop.h
 
-Copyright (C) 2017
+Copyright (C) 2025
 
 Author: David Preti <david.preti@csic.es>
+Author: Alexis Verney-Provatas <2414441@swansea.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -60,4 +61,43 @@ class PolyakovLogger : public HmcObservable<typename Impl::Field> {
   }
 };
 
+template <class Impl>
+class SpatialPolyakovLogger : public HmcObservable<typename Impl::Field> {
+ public:
+    // here forces the Impl to be of gauge fields
+    // if not the compiler will complain
+    INHERIT_GIMPL_TYPES(Impl);
+
+    // necessary for HmcObservable compatibility
+    typedef typename Impl::Field Field;
+    // Log the Polyakov loop along directions 0..2 for trajectory traj; sRNG and pRNG are not used.
+    void TrajectoryComplete(int traj,
+                            Field &U,
+                            GridSerialRNG &sRNG,
+                            GridParallelRNG &pRNG) {
+
+    // Save current numerical output precision (precision() returns std::streamsize, not int)
+    std::streamsize def_prec = std::cout.precision();
+
+    // Assume that the dimensions are D=3+1, i.e. directions 0,1,2 are the spatial ones
+    int Ndim = 3;
+    ComplexD polyakov;
+   
+    // Iterate over the spatial directions and print, one line per direction,
+    // the value returned by WilsonLoops<Impl>::avgPolyakovLoop for that direction
+    for (int idx=0; idx<Ndim; idx++) {
+        polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U, idx);
+    
+        std::cout << GridLogMessage
+            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+            << "Polyakov Loop in the " << idx << " spatial direction : [ " << traj << " ] "<< polyakov << std::endl;
+
+    }
+
+    // Return to original output precision
+    std::cout.precision(def_prec);
+
+  }
+};
+
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h
index 3ee00d6c..73243be2 100644
--- a/Grid/qcd/smearing/GaugeConfigurationMasked.h
+++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h
@@ -825,6 +825,7 @@ public:
   virtual void fill_smearedSet(GaugeField &U)
   {
     this->ThinLinks = &U;  // attach the smearing routine to the field U
+    std::cout << GridLogMessage << " fill_smearedSet " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
 
     // check the pointer is not null
     if (this->ThinLinks == NULL)
@@ -846,6 +847,8 @@ public:
 	ApplyMask(smeared_A,smearLvl);
 	smeared_B = previous_u;
 	ApplyMask(smeared_B,smearLvl);
+	std::cout << GridLogMessage << " smeared_A " << norm2(smeared_A) << std::endl;
+	std::cout << GridLogMessage << " smeared_B " << norm2(smeared_B) << std::endl;
 	// Replace only the masked portion
 	this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
         previous_u = this->SmearedSet[smearLvl];
diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h
index e98e9b87..6ffc77ba 100644
--- a/Grid/qcd/smearing/HISQSmearing.h
+++ b/Grid/qcd/smearing/HISQSmearing.h
@@ -99,16 +99,16 @@ public:
     Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
         : _grid(grid), 
           _linkTreatment(c1,cnaik,c3,c5,c7,clp) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
+        GRID_ASSERT(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
+        GRID_ASSERT(Nd == 4 && "HISQ smearing only defined for Nd==4");
     }
 
     // Allow to pass a pointer to a C-style, double array for MILC convenience
     Smear_HISQ(GridCartesian* grid, double* coeff) 
         : _grid(grid), 
           _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
+        GRID_ASSERT(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
+        GRID_ASSERT(Nd == 4 && "HISQ smearing only defined for Nd==4");
     }
 
     ~Smear_HISQ() {}
diff --git a/Grid/qcd/smearing/JacobianAction.h b/Grid/qcd/smearing/JacobianAction.h
index 6db59bec..bc691581 100644
--- a/Grid/qcd/smearing/JacobianAction.h
+++ b/Grid/qcd/smearing/JacobianAction.h
@@ -53,9 +53,9 @@ public:
   //////////////////////////////////
   // Usual cases are not used
   //////////////////////////////////
-  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0);};
-  virtual RealD S(const GaugeField &U) { assert(0); }
-  virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0);  }
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ GRID_ASSERT(0);};
+  virtual RealD S(const GaugeField &U) { GRID_ASSERT(0); return 0; }
+  virtual void deriv(const GaugeField &U, GaugeField &dSdU) { GRID_ASSERT(0);  }
 
   //////////////////////////////////
   // Functions of smart configs only
@@ -67,7 +67,7 @@ public:
   virtual RealD S(ConfigurationBase<GaugeField>& U)
   {
     // det M = e^{ - ( - logDetM) }
-    assert( &U == smearer );
+    GRID_ASSERT( &U == smearer );
     return -smearer->logDetJacobian();
   }
   virtual RealD Sinitial(ConfigurationBase<GaugeField>& U) 
@@ -76,7 +76,7 @@ public:
   }
   virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
   {
-    assert( &U == smearer );
+    GRID_ASSERT( &U == smearer );
     smearer->logDetJacobianForce(dSdU);
   }
 
diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h
index 787ef104..5ae6fbe8 100644
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -63,20 +63,20 @@ public:
 
   /*! Stout smearing with base explicitly specified */
   Smear_Stout(Smear<Gimpl>* base) : SmearBase{base} {
-    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
+    GRID_ASSERT(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
   }
 
   /*! Construct stout smearing object from explicitly specified rho matrix */
   Smear_Stout(const std::vector<double>& rho_)
     : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
     std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
-    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
+    GRID_ASSERT(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
     }
 
   /*! Default constructor. rho is constant in all directions, optionally except for orthogonal dimension */
   Smear_Stout(double rho = 1.0, int orthogdim = -1)
   : OrthogDim{orthogdim}, SmearRho{ rho3D(rho,orthogdim) }, OwnedBase{ new Smear_APE<Gimpl>(SmearRho) }, SmearBase{OwnedBase.get()} {
-    assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
+    GRID_ASSERT(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
   }
 
   ~Smear_Stout() {}  // delete SmearBase...
diff --git a/Grid/qcd/smearing/WilsonFlow.h b/Grid/qcd/smearing/WilsonFlow.h
index 188c6973..a89696ac 100644
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -37,21 +37,38 @@ class WilsonFlowBase: public Smear<Gimpl>{
 public:
   //Store generic measurements to take during smearing process using std::function
   typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType;  //int: step,  RealD: flow time,  GaugeField : the gauge field
+   
+  INHERIT_GIMPL_TYPES(Gimpl);
+
+  typedef Action<typename Gimpl::GaugeField> ActionBase;
 
 protected:
+
   std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
 
-  mutable WilsonGaugeAction<Gimpl> SG;
-   
-public:
-  INHERIT_GIMPL_TYPES(Gimpl)
+  
+  ActionBase *SG;
 
-  explicit WilsonFlowBase(unsigned int meas_interval =1):
-    SG(WilsonGaugeAction<Gimpl>(3.0)) {
-    // WilsonGaugeAction with beta 3.0
+public:
+
+//Define the action used to evolve the plaquettes
+//(Lüscher: https://arxiv.org/pdf/1006.4518 eq. 1.4)
+//V'(t) = -g^2 * ( d/dVt S[Vt](g) ) * Vt
+//      = -g^2 * ( d/dVt (1/g^2 * sum_p Re tr{ 1 - Vt(p) } ) ) * Vt
+//      = - d/dVt ( sum_p ( Nc - Re tr Vt(p) ) ) * Vt
+//      = - d/dVt ( Nc * sum_p ( 1 - Re tr Vt(p)/Nc ) ) * Vt
+//      = - d/dVt SG[Vt](Nc) * Vt
+  explicit WilsonFlowBase(unsigned int meas_interval =1) {
+    // Default flow action: WilsonGaugeAction constructed with Gimpl::num_colours. NOTE(review): raw new; no matching delete is visible in this hunk - confirm ownership.
+    SG = (ActionBase *) new WilsonGaugeAction<Gimpl>(Gimpl::num_colours);
     setDefaultMeasurements(meas_interval);
   }
-    
+
+  void setGaugeAction(ActionBase *TheAction) // replace the action whose deriv() is used by the flow evolution steps
+  {
+    SG = TheAction; // plain pointer assignment: the previously held SG is not deleted here
+  }
+  
   void resetActions(){ functions.clear(); }
 
   void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
@@ -63,7 +80,7 @@ public:
   void setDefaultMeasurements(int topq_meas_interval = 1);
 
   void derivative(GaugeField&, const GaugeField&, const GaugeField&) const override{
-    assert(0);
+    GRID_ASSERT(0);
     // undefined for WilsonFlow
   }
 
@@ -138,9 +155,17 @@ public:
 ////////////////////////////////////////////////////////////////////////////////
 // Implementations
 ////////////////////////////////////////////////////////////////////////////////
+
+//Compute t^2 <E(t)> at flow time t from the plaquette form
+//(Lüscher: https://arxiv.org/pdf/1006.4518 eq. 3.1)
+//E(t) = 2 * sum_p Retr{ 1 - Vt(p) }
+//     = 2 * sum_p ( Nc - Retr Vt(p) )
+//     = 2 * Nc * sum_p ( 1 - Retr Vt(p)/Nc )
+//     = 2 * SG[Vt](Nc)
+//We divide by the volume to get an energy density per site, as is convention
 template <class Gimpl>
 RealD WilsonFlowBase<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
-  static WilsonGaugeAction<Gimpl> SG(3.0);
+  static WilsonGaugeAction<Gimpl> SG(Gimpl::num_colours); // function-local Wilson action; distinct from the SG pointer set via setGaugeAction()
   return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
 }
 
@@ -150,7 +175,7 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
   typedef typename Gimpl::GaugeLinkField GaugeMat;
   typedef typename Gimpl::GaugeField GaugeLorentz;
 
-  assert(Nd == 4);
+  GRID_ASSERT(Nd == 4);
   //E = 1/2 tr( F_munu F_munu )
   //However as  F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths:
   //F_01 F_02 F_03   F_12 F_13  F_23
@@ -225,17 +250,17 @@ template <class Gimpl>
 void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
   GaugeField Z(U.Grid());
   GaugeField tmp(U.Grid());
-  this->SG.deriv(U, Z);
+  this->SG->deriv(U, Z);
   Z *= 0.25;                                  // Z0 = 1/4 * F(U)
   Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0
 
   Z *= -17.0/8.0;
-  this->SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
+  this->SG->deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
   Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
   Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1
 
   Z *= -4.0/3.0;
-  this->SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
+  this->SG->deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
   Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
   Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
   tau += epsilon;
@@ -285,20 +310,20 @@ int WilsonFlowAdaptive<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &
   Uprime = U;
   Usave = U;
 
-  this->SG.deriv(U, Z);
+  this->SG->deriv(U, Z);
   Zprime = -Z;
   Z *= 0.25;                                  // Z0 = 1/4 * F(U)
   Gimpl::update_field(Z, U, -2.0*eps);    // U = W1 = exp(ep*Z0)*W0
 
   Z *= -17.0/8.0;
-  this->SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
+  this->SG->deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
   Zprime += 2.0*tmp;
   Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
   Gimpl::update_field(Z, U, -2.0*eps);    // U_= W2 = exp(ep*Z)*W1
     
 
   Z *= -4.0/3.0;
-  this->SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
+  this->SG->deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
   Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
   Gimpl::update_field(Z, U, -2.0*eps);    // V(t+e) = exp(ep*Z)*W2
 
diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h
index 7089fd1b..0afb382d 100644
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -62,16 +62,37 @@ public:
 			 const FermionField *rhs_vj,
 			 std::vector<Gamma::Algebra> gammas,
 			 const std::vector<ComplexField > &mom,
-			 int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+			 int orthogdim);
+  template <typename TensorType> // overload that additionally accepts a timer pointer
+  static void MesonField(TensorType &mat, 
+			 const FermionField *lhs_wi,
+			 const FermionField *rhs_vj,
+			 std::vector<Gamma::Algebra> gammas,
+			 const std::vector<ComplexField > &mom,
+			 int orthogdim,double *timer) // timer is never read or written by this overload
+  {
+    MesonField(mat,lhs_wi,rhs_vj,gammas,mom,orthogdim); // forwards to the six-argument overload
+  }
 
   template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
   static void AslashField(TensorType &mat, 
-        const FermionField *lhs_wi,
-        const FermionField *rhs_vj,
-        const std::vector<ComplexField> &emB0,
-        const std::vector<ComplexField> &emB1,
-        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+			  const FermionField *lhs_wi,
+			  const FermionField *rhs_vj,
+			  const std::vector<ComplexField> &emB0,
+			  const std::vector<ComplexField> &emB1,
+			  int orthogdim);
 
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void AslashField(TensorType &mat, 
+			  const FermionField *lhs_wi,
+			  const FermionField *rhs_vj,
+			  const std::vector<ComplexField> &emB0,
+			  const std::vector<ComplexField> &emB1,
+			  int orthogdim,double *timer) // timer is never read or written by this overload
+  {
+    AslashField(mat,lhs_wi,rhs_vj,emB0,emB1,orthogdim); // forwards to the six-argument overload
+  }
+  
   template <typename TensorType>
   typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
                            std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
@@ -136,7 +157,7 @@ typedef iVecComplex<vComplex >             vVecComplex;
 typedef Lattice<vVecComplex>               LatticeVecComplex;
 
 #define A2A_GPU_KERNELS
-#ifdef A2A_GPU_KERNELS
+
 template <class FImpl>
 template <typename TensorType>
 void A2Autils<FImpl>::MesonField(TensorType &mat, 
@@ -144,7 +165,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 				 const FermionField *rhs_vj,
 				 std::vector<Gamma::Algebra> gammas,
 				 const std::vector<ComplexField > &mom,
-				 int orthogdim, double *t_kernel, double *t_gsum) 
+				 int orthogdim) 
 {
   const int block=A2Ablocking;
   typedef typename FImpl::SiteSpinor vobj;
@@ -156,8 +177,8 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
   int Lblock = mat.dimension(3); 
   int Rblock = mat.dimension(4);
 
-  //  assert(Lblock % block==0);
-  //  assert(Rblock % block==0);
+  //  GRID_ASSERT(Lblock % block==0);
+  //  GRID_ASSERT(Rblock % block==0);
   
   GridBase *grid = lhs_wi[0].Grid();
   
@@ -170,19 +191,37 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 
   LatticeVecSpinMatrix SpinMat(grid);
   LatticeVecSpinMatrix MomSpinMat(grid);
+
+  std::cout <<GridLogMessage<< "A2A Meson Field"<<std::endl;
+  MomentumProject<LatticeVecSpinMatrix,ComplexField> MP;
+  MP.Allocate(Nmom,grid);
+  MP.ImportMomenta(mom);
+  std::cout <<GridLogMessage<< "Momentum project momenta imported"<<std::endl;
+
+  double t_view, t_gamma, t_kernel, t_momproj;
+  t_view=0;
+  t_gamma=0;
+  t_kernel=0;
+  t_momproj=0;
+  
   
   std::vector<VecSpinMatrix> sliced;
   for(int i=0;i<Lblock;i++){
+    t_view -= usecond();
     autoView(SpinMat_v,SpinMat,AcceleratorWrite);
     autoView(lhs_v,lhs_wi[i],AcceleratorRead);
+    t_view += usecond();
     for(int jo=0;jo<Rblock;jo+=block){
       for(int j=jo;j<MIN(Rblock,jo+block);j++){
 	int jj=j%block;
+	t_view -= usecond();
 	autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
+	t_view += usecond();
 	//////////////////////////////////////////
 	// Should write a SpinOuterColorTrace
 	//////////////////////////////////////////
 
+	t_kernel -= usecond();
 	accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
 	    auto left = conjugate(lhs_v(ss));
 	    auto right = rhs_v(ss);
@@ -195,28 +234,38 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	      }}
 	    coalescedWrite(SpinMat_v[ss],vv);
 	  });
+	t_kernel += usecond();
 
       }// j within block
       // After getting the sitewise product do the mom phase loop
-      for(int m=0;m<Nmom;m++){
 
-	MomSpinMat   = SpinMat * mom[m];
+      GRID_ASSERT(orthogdim==Nd-1); // MP.Project takes no orthogdim argument; only the last dimension is accepted here
+      t_momproj -= usecond();
+      MP.Project(SpinMat,sliced);
+      t_momproj +=  usecond();
 
-	sliceSum(MomSpinMat,sliced,orthogdim);
-
-	for(int mu=0;mu<Ngamma;mu++){
-	  for(int t=0;t<sliced.size();t++){
-	    for(int j=jo;j<MIN(Rblock,jo+block);j++){
-	      int jj=j%block;
-	      auto tmp = peekIndex<LorentzIndex>(sliced[t],jj);
+      t_gamma -=  usecond();
+      thread_for2d( m, Nmom,t,Nt,{
+	  //      for(int m=0;m<Nmom;m++)
+	  //	for(int t=0;t<Nt;t++)
+	  int idx = t+m*Nt;
+	  for(int j=jo;j<MIN(Rblock,jo+block);j++){
+	    int jj=j%block;
+	    auto tmp = peekIndex<LorentzIndex>(sliced[idx],jj);
+	    for(int mu=0;mu<Ngamma;mu++){
 	      auto trSG = trace(tmp*Gamma(gammas[mu]));
-	      mat(m,mu,t,i,j) = trSG()();
+	      mat((long)m,mu,(long)t,i,j) = trSG()();
 	    }
 	  }
-	}
-      }
+      }); 
+      t_gamma +=  usecond();
     }//jo
   }
+  std::cout << GridLogMessage<<" A2A::MesonField t_view    "<<t_view/1e6<<"s"<<std::endl;
+  std::cout << GridLogMessage<<" A2A::MesonField t_momproj "<<t_momproj/1e6<<"s"<<std::endl;
+  std::cout << GridLogMessage<<" A2A::MesonField t_kernel  "<<t_kernel/1e6<<"s"<<std::endl;
+  std::cout << GridLogMessage<<" A2A::MesonField t_gamma   "<<t_gamma/1e6<<"s"<<std::endl;
+  
 }
 
 // "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
@@ -240,7 +289,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 				  const FermionField *rhs_vj,
 				  const std::vector<ComplexField> &emB0,
 				  const std::vector<ComplexField> &emB1,
-				  int orthogdim, double *t_kernel, double *t_gsum) 
+				  int orthogdim) 
 {
   const int block=A2Ablocking;
   typedef typename FImpl::SiteSpinor vobj;
@@ -253,10 +302,10 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
   int Rblock = mat.dimension(4);
 
   int Nem = emB0.size();
-  assert(emB1.size() == Nem);
+  GRID_ASSERT(emB1.size() == Nem);
 
-  //  assert(Lblock % block==0);
-  //  assert(Rblock % block==0);
+  //  GRID_ASSERT(Lblock % block==0);
+  //  GRID_ASSERT(Rblock % block==0);
   
   GridBase *grid = lhs_wi[0].Grid();
   
@@ -327,354 +376,6 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
   }
 }
 
-#else
-template <class FImpl>
-template <typename TensorType>
-void A2Autils<FImpl>::MesonField(TensorType &mat, 
-				 const FermionField *lhs_wi,
-				 const FermionField *rhs_vj,
-				 std::vector<Gamma::Algebra> gammas,
-				 const std::vector<ComplexField > &mom,
-				 int orthogdim, double *t_kernel, double *t_gsum) 
-{
-  typedef typename FImpl::SiteSpinor vobj;
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-  
-  int Lblock = mat.dimension(3); 
-  int Rblock = mat.dimension(4);
-
-  GridBase *grid = lhs_wi[0].Grid();
-  
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt     = grid->GlobalDimensions()[orthogdim];
-  int Ngamma = gammas.size();
-  int Nmom   = mom.size();
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  int MFrvol = rd*Lblock*Rblock*Nmom;
-  int MFlvol = ld*Lblock*Rblock*Nmom;
-
-  std::vector<SpinMatrix_v > lvSum(MFrvol);
-  for(int r=0;r<MFrvol;r++){
-    lvSum[r] = Zero();
-  }
-
-  std::vector<SpinMatrix_s > lsSum(MFlvol);             
-  for(int r=0;r<MFlvol;r++){
-    lsSum[r]=scalar_type(0.0);
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  // potentially wasting cores here if local time extent too small
-  if (t_kernel) *t_kernel = -usecond();
-  for(int r=0;r<rd;r++) {
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-
-	int ss= so+n*stride+b;
-
-	for(int i=0;i<Lblock;i++){
-
-	  // Recreate view potentially expensive outside fo UVM mode
-	  autoView(lhs_v,lhs_wi[i],CpuRead);
-	  auto left = conjugate(lhs_v[ss]);
-	  for(int j=0;j<Rblock;j++){
-
-	    SpinMatrix_v vv;
-	    // Recreate view potentially expensive outside fo UVM mode
-	    autoView(rhs_v,rhs_vj[j],CpuRead);
-	    auto right = rhs_v[ss];
-	    for(int s1=0;s1<Ns;s1++){
-	    for(int s2=0;s2<Ns;s2++){
-	      vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-		+             left()(s2)(1) * right()(s1)(1)
-		+             left()(s2)(2) * right()(s1)(2);
-	    }}
-	    
-	    // After getting the sitewise product do the mom phase loop
-	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
-	    for ( int m=0;m<Nmom;m++){
-	      int idx = m+base;
-	      autoView(mom_v,mom[m],CpuRead);
-	      auto phase = mom_v[ss];
-	      mac(&lvSum[idx],&vv,&phase);
-	    }
-	  }
-	}
-      }
-    }
-  };
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  for(int rt=0;rt<rd;rt++){
-
-    Coordinate icoor(Nd);
-    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
-
-    for(int i=0;i<Lblock;i++){
-    for(int j=0;j<Rblock;j++){
-    for(int m=0;m<Nmom;m++){
-
-      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-
-      extract(lvSum[ij_rdx],extracted);
-
-      for(int idx=0;idx<Nsimd;idx++){
-
-	grid->iCoorFromIindex(icoor,idx);
-
-	int ldx    = rt+icoor[orthogdim]*rd;
-
-	int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-
-      }
-    }}}
-  }
-  if (t_kernel) *t_kernel += usecond();
-  assert(mat.dimension(0) == Nmom);
-  assert(mat.dimension(1) == Ngamma);
-  assert(mat.dimension(2) == Nt);
-
-  // ld loop and local only??
-  int pd = grid->_processors[orthogdim];
-  int pc = grid->_processor_coor[orthogdim];
-  thread_for_collapse(2,lt,ld,{
-    for(int pt=0;pt<pd;pt++){
-      int t = lt + pt*ld;
-      if (pt == pc){
-	for(int i=0;i<Lblock;i++){
-	  for(int j=0;j<Rblock;j++){
-	    for(int m=0;m<Nmom;m++){
-	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
-	      for(int mu=0;mu<Ngamma;mu++){
-		// this is a bit slow
-		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
-	      }
-	    }
-	  }
-	}
-      } else { 
-	const scalar_type zz(0.0);
-	for(int i=0;i<Lblock;i++){
-	  for(int j=0;j<Rblock;j++){
-	    for(int mu=0;mu<Ngamma;mu++){
-	      for(int m=0;m<Nmom;m++){
-		mat(m,mu,t,i,j) =zz;
-	      }
-	    }
-	  }
-	}
-      }
-    }
-  });
-
-  ////////////////////////////////////////////////////////////////////
-  // This global sum is taking as much as 50% of time on 16 nodes
-  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
-  // Healthy size that should suffice
-  ////////////////////////////////////////////////////////////////////
-  if (t_gsum) *t_gsum = -usecond();
-  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-  if (t_gsum) *t_gsum += usecond();
-}
-
-template <class FImpl>
-template <typename TensorType>
-void A2Autils<FImpl>::AslashField(TensorType &mat, 
-				  const FermionField *lhs_wi,
-				  const FermionField *rhs_vj,
-				  const std::vector<ComplexField> &emB0,
-				  const std::vector<ComplexField> &emB1,
-				  int orthogdim, double *t_kernel, double *t_gsum) 
-{
-  typedef typename FermionField::vector_object vobj;
-  typedef typename vobj::scalar_object         sobj;
-  typedef typename vobj::scalar_type           scalar_type;
-  typedef typename vobj::vector_type           vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-  typedef iSinglet<vector_type>    Singlet_v;
-  typedef iSinglet<scalar_type>    Singlet_s;
-    
-  int Lblock = mat.dimension(3); 
-  int Rblock = mat.dimension(4);
-  
-  GridBase *grid = lhs_wi[0].Grid();
-  
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt  = grid->GlobalDimensions()[orthogdim];
-  int Nem = emB0.size();
-  assert(emB1.size() == Nem);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-  
-    // will locally sum vectors first
-    // sum across these down to scalars
-    // splitting the SIMD
-    int MFrvol = rd*Lblock*Rblock*Nem;
-    int MFlvol = ld*Lblock*Rblock*Nem;
-
-    std::vector<vector_type> lvSum(MFrvol);
-    thread_for(r,MFrvol,
-    {
-      lvSum[r] = Zero();
-    });
-
-    std::vector<scalar_type> lsSum(MFlvol);             
-    thread_for(r,MFlvol,
-    {
-        lsSum[r] = scalar_type(0.0);
-    });
-
-    int e1=    grid->_slice_nblock[orthogdim];
-    int e2=    grid->_slice_block [orthogdim];
-    int stride=grid->_slice_stride[orthogdim];
-
-    // Nested parallelism would be ok
-    // Wasting cores here. Test case r
-    if (t_kernel) *t_kernel = -usecond();
-    for(int r=0;r<rd;r++)
-    {
-        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-        for(int n=0;n<e1;n++)
-        for(int b=0;b<e2;b++)
-        {
-            int ss= so+n*stride+b;
-
-            for(int i=0;i<Lblock;i++)
-            {
-  	        autoView(wi_v,lhs_wi[i],CpuRead);
-                auto left = conjugate(wi_v[ss]);
-
-                for(int j=0;j<Rblock;j++)
-                {
-                    SpinMatrix_v vv;
-		    autoView(vj_v,rhs_vj[j],CpuRead);
-                    auto right = vj_v[ss];
-
-                    for(int s1=0;s1<Ns;s1++)
-                    for(int s2=0;s2<Ns;s2++)
-                    {
-		          vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-                                        + left()(s2)(1) * right()(s1)(1)
-                                        + left()(s2)(2) * right()(s1)(2);
-                    }
-
-		    // After getting the sitewise product do the mom phase loop
-                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
-
-                    for ( int m=0;m<Nem;m++)
-                    {
-  		        autoView(emB0_v,emB0[m],CpuRead);
-		        autoView(emB1_v,emB1[m],CpuRead);
-                        int idx  = m+base;
-                        auto b0  = emB0_v[ss];
-                        auto b1  = emB1_v[ss];
-                        auto cb0 = conjugate(b0);
-                        auto cb1 = conjugate(b1);
-
-                        lvSum[idx] += - vv()(3,0)()*b0()()()  - vv()(2,0)()*cb1()()()
-                                      + vv()(3,1)()*b1()()()  - vv()(2,1)()*cb0()()()
-                                      + vv()(0,2)()*b1()()()  + vv()(1,2)()*b0()()()
-                                      + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();
-                    }
-                }
-            }
-        }
-    }
-
-    // Sum across simd lanes in the plane, breaking out orthog dir.
-    thread_for(rt,rd,
-    {
-        Coordinate icoor(Nd);
-        ExtractBuffer<scalar_type> extracted(Nsimd);               
-
-        for(int i=0;i<Lblock;i++)
-        for(int j=0;j<Rblock;j++)
-        for(int m=0;m<Nem;m++)
-        {
-
-            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
-
-            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
-            for(int idx=0;idx<Nsimd;idx++)
-            {
-                grid->iCoorFromIindex(icoor,idx);
-
-                int ldx    = rt+icoor[orthogdim]*rd;
-                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
-
-                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-            }
-        }
-    });
-    if (t_kernel) *t_kernel += usecond();
-
-    // ld loop and local only??
-    int pd = grid->_processors[orthogdim];
-    int pc = grid->_processor_coor[orthogdim];
-    thread_for_collapse(2,lt,ld,
-    {
-        for(int pt=0;pt<pd;pt++)
-        {
-            int t = lt + pt*ld;
-            if (pt == pc)
-            {
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nem;m++)
-                {
-                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
-
-                    mat(m,0,t,i,j) = lsSum[ij_dx];
-                }
-            } 
-            else 
-            { 
-                const scalar_type zz(0.0);
-
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nem;m++)
-                {
-                    mat(m,0,t,i,j) = zz;
-                }
-            }
-        }
-    });
-    if (t_gsum) *t_gsum = -usecond();
-    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
-    if (t_gsum) *t_gsum += usecond();
-}
-#endif
 ////////////////////////////////////////////
 // Schematic thoughts about more generalised four quark insertion
 //
@@ -956,7 +657,7 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
 						      ComplexField &O_trtr,
 						      ComplexField &O_fig8)
 {
-  assert(gamma0.size()==gamma1.size());
+  GRID_ASSERT(gamma0.size()==gamma1.size());
   int Ng = gamma0.size();
 
   GridBase *grid = WWVV0.Grid();
@@ -1000,7 +701,7 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
 						 ComplexField &O_trtr,
 						 ComplexField &O_fig8)
 {
-  assert(gamma0.size()==gamma1.size());
+  GRID_ASSERT(gamma0.size()==gamma1.size());
   int Ng = gamma0.size();
 
   GridBase *grid = WWVV0.Grid();
@@ -1119,7 +820,7 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
   int N_s = WW_sd.dimension(1); 
   int N_d = WW_sd.dimension(2);
 
-  assert(grid->GlobalDimensions()[orthogdim] == N_t);
+  GRID_ASSERT(grid->GlobalDimensions()[orthogdim] == N_t);
   double vol         = 1.0;
   for(int dim=0;dim<nd;dim++){
     vol = vol * grid->GlobalDimensions()[dim];
@@ -1472,9 +1173,9 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
     }}}
   }
   if (t_kernel) *t_kernel += usecond();
-  assert(mat.dimension(0) == Nmom);
-  assert(mat.dimension(1) == Ngamma);
-  assert(mat.dimension(2) == Nt);
+  GRID_ASSERT(mat.dimension(0) == Nmom);
+  GRID_ASSERT(mat.dimension(1) == Ngamma);
+  GRID_ASSERT(mat.dimension(2) == Nt);
 
   // ld loop and local only??
   int pd = grid->_processors[orthogdim];
@@ -1637,7 +1338,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
     }}
   });
 
-  assert(mat.dimension(0) == Nt);
+  GRID_ASSERT(mat.dimension(0) == Nt);
   // ld loop and local only??
   int pd = grid->_processors[orthogdim];
   int pc = grid->_processor_coor[orthogdim];
@@ -1785,8 +1486,8 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
     }}}
   });
 
-  assert(mat.dimension(0) == Nmom);
-  assert(mat.dimension(1) == Nt);
+  GRID_ASSERT(mat.dimension(0) == Nmom);
+  GRID_ASSERT(mat.dimension(1) == Nt);
  
   int pd = grid->_processors[orthogdim];
   int pc = grid->_processor_coor[orthogdim];
diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 9a1d312b..a89d77be 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -508,7 +508,7 @@ void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1,
  * The array wick_contractions must be of length 6               */
 template<class FImpl>
 void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, int &wick_contractions) {
-    assert(qi.size() == 3 && qf.size() == 3 && "Only sets of 3 quarks accepted.");
+    GRID_ASSERT(qi.size() == 3 && qf.size() == 3 && "Only sets of 3 quarks accepted.");
     const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
     wick_contractions=0;
     for (int ie=0; ie < 6 ; ie++) {
@@ -536,10 +536,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
              ComplexField &baryon_corr)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  GRID_ASSERT(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
   GridBase *grid = q1_left.Grid();
   
@@ -587,8 +587,8 @@ void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left,
              SpinMatrixField &baryon_corr)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   GridBase *grid = q1_left.Grid();
 
@@ -628,10 +628,10 @@ void BaryonUtils<FImpl>::ContractBaryonsSliced(const mobj &D1,
              robj &result)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  GRID_ASSERT(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
   for (int t=0; t<nt; t++) {
     BaryonSite(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result[t]);
@@ -652,8 +652,8 @@ void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1,
              robj &result)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   for (int t=0; t<nt; t++) {
     BaryonSiteMatrix(D1[t],D2[t],D3[t],GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result[t]);
@@ -962,8 +962,8 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
                         const Gamma GammaBf,
                         SpinMatrixField &stn_corr)
 {
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   GridBase *grid = q_tf.Grid();
 
@@ -1292,8 +1292,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
              SpinMatrixField &stn_corr)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   GridBase *grid = qs_ti.Grid();
 
@@ -1327,7 +1327,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
       coalescedWrite(vcorr[ss],result);
     });//end loop over lattice sites
   } else {
-    assert(0 && "Weak Operator not correctly specified");
+    GRID_ASSERT(0 && "Weak Operator not correctly specified");
   }
 }
 
@@ -1345,8 +1345,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
              SpinMatrixField &stn_corr)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   GridBase *grid = qs_ti.Grid();
 
@@ -1383,7 +1383,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
       coalescedWrite(vcorr[ss],result);
     });//end loop over lattice sites
   } else {
-    assert(0 && "Weak Operator not correctly specified");
+    GRID_ASSERT(0 && "Weak Operator not correctly specified");
   }
 }
 
@@ -1538,8 +1538,8 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
 						 SpinMatrixField &xts_corr)
 {
 
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+  GRID_ASSERT(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  GRID_ASSERT(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
   GridBase *grid = qs_ti.Grid();
 
@@ -1574,7 +1574,7 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
       coalescedWrite(vcorr[ss],result);
     }  );//end loop over lattice sites
   } else {
-    assert(0 && "Weak Operator not correctly specified");
+    GRID_ASSERT(0 && "Weak Operator not correctly specified");
   }
 }
 
diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
index 583c0f2b..2583ece5 100644
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@@ -242,7 +242,7 @@ namespace ConjugateBC {
   {
     GridBase *grid = Link.Grid();
     int Lmu = grid->GlobalDimensions()[mu];
-    assert(abs(shift) < Lmu && "Invalid shift value");
+    GRID_ASSERT(abs(shift) < Lmu && "Invalid shift value");
 
     Lattice<iScalar<vInteger>> coor(grid);
     LatticeCoordinate(coor, mu);
diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h
index 94322507..e377b791 100644
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -91,9 +91,9 @@ public:
 
   };
 
-  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
-  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);}
-  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
+  void Mdir(const GaugeField&, GaugeField&, int, int){ GRID_ASSERT(0);}
+  void MdirAll(const GaugeField&, std::vector<GaugeField> &){ GRID_ASSERT(0);}
+  void Mdiag(const GaugeField&, GaugeField&){ GRID_ASSERT(0);}
 
   void ImportGauge(const GaugeField& _U) {
     for (int mu = 0; mu < Nd; mu++) {
diff --git a/Grid/qcd/utils/GaugeFix.h b/Grid/qcd/utils/GaugeFix.h
index fc723fe3..95c4dd48 100644
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@@ -142,7 +142,7 @@ public:
     }
     std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
     if (err_on_no_converge)
-      assert(0 && "Gauge fixing did not converge within the specified number of iterations");
+      GRID_ASSERT(0 && "Gauge fixing did not converge within the specified number of iterations");
   };
   static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) {
     GridBase *grid = U[0].Grid();
diff --git a/Grid/qcd/utils/GaugeGroupTwoIndex.h b/Grid/qcd/utils/GaugeGroupTwoIndex.h
index 85685ddb..60d10299 100644
--- a/Grid/qcd/utils/GaugeGroupTwoIndex.h
+++ b/Grid/qcd/utils/GaugeGroupTwoIndex.h
@@ -165,7 +165,7 @@ public:
   template <class cplx>
   static void base(int Index, iGroupMatrix<cplx> &eij) {
   // returns (e)^(ij)_{kl} necessary for change of base U_F -> U_R
-    assert(Index < Dimension);
+    GRID_ASSERT(Index < Dimension);
     eij = Zero();
   // for the linearisation of the 2 indexes
     static int a[ncolour * (ncolour - 1) / 2][2];  // store the a <-> i,j
@@ -243,7 +243,7 @@ public:
     for (int a = 0; a < NumGenerators; a++) {
       generator(a, i2indTa);
       std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(trace(i2indTa)) < 1.0e-6);
+      GRID_ASSERT(norm2(trace(i2indTa)) < 1.0e-6);
     }
     std::cout << GridLogMessage << std::endl;
 
@@ -252,7 +252,7 @@ public:
     for (int a = 0; a < NumGenerators; a++) {
       generator(a, i2indTa);
       std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(adj(i2indTa) + i2indTa) < 1.0e-6);
+      GRID_ASSERT(norm2(adj(i2indTa) + i2indTa) < 1.0e-6);
     }
 
     std::cout << GridLogMessage << std::endl;
@@ -269,11 +269,11 @@ public:
         std::cout << GridLogMessage << "a=" << a << "b=" << b << "Tr=" << Tr
                   << std::endl;
         if (a == b) {
-          assert(real(Tr) - ((ncolour + S * 2) * 0.5) < 1e-8);
+          GRID_ASSERT(real(Tr) - ((ncolour + S * 2) * 0.5) < 1e-8);
         } else {
-          assert(real(Tr) < 1e-8);
+          GRID_ASSERT(real(Tr) < 1e-8);
         }
-        assert(imag(Tr) < 1e-8);
+        GRID_ASSERT(imag(Tr) < 1e-8);
       }
     }
     std::cout << GridLogMessage << std::endl;
diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h
index d049fcd0..bcccc7c4 100644
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -313,7 +313,7 @@ static void SubGroupHeatBath(
   // Debug test for sanity
   uinv = adj(u);
   b = u * uinv - 1.0;
-  assert(norm2(b) < 1.0e-4);
+  GRID_ASSERT(norm2(b) < 1.0e-4);
 
   /*
     Measure: Haar measure dh has d^4a delta(1-|a^2|)
@@ -452,22 +452,22 @@ static void SubGroupHeatBath(
   u = Zero();
   check = ua * adj(ua) - 1.0;
   check = where(Accepted, check, u);
-  assert(norm2(check) < 1.0e-4);
+  GRID_ASSERT(norm2(check) < 1.0e-4);
 
   check = b * adj(b) - 1.0;
   check = where(Accepted, check, u);
-  assert(norm2(check) < 1.0e-4);
+  GRID_ASSERT(norm2(check) < 1.0e-4);
 
   LatticeMatrix Vcheck(grid);
   Vcheck = Zero();
   Vcheck = where(Accepted, V * adj(V) - 1.0, Vcheck);
   //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
-  assert(norm2(Vcheck) < 1.0e-4);
+  GRID_ASSERT(norm2(Vcheck) < 1.0e-4);
 
   // Verify the link stays in SU(3)
   //    std::cout<<GridLogMessage <<"Checking the modified link"<<std::endl;
   Vcheck = link * adj(link) - 1.0;
-  assert(norm2(Vcheck) < 1.0e-4);
+  GRID_ASSERT(norm2(Vcheck) < 1.0e-4);
   /////////////////////////////////
 }
 
@@ -485,8 +485,8 @@ static void testGenerators(GroupName::SU) {
       Complex tr = TensorRemove(trace(ta * tb));
       std::cout << GridLogMessage << "(" << a << "," << b << ") =  " << tr
                 << std::endl;
-      if (a == b) assert(abs(tr - Complex(0.5)) < 1.0e-6);
-      if (a != b) assert(abs(tr) < 1.0e-6);
+      if (a == b) GRID_ASSERT(abs(tr - Complex(0.5)) < 1.0e-6);
+      if (a != b) GRID_ASSERT(abs(tr) < 1.0e-6);
     }
     std::cout << GridLogMessage << std::endl;
   }
@@ -495,7 +495,7 @@ static void testGenerators(GroupName::SU) {
   for (int a = 0; a < AdjointDimension; a++) {
     generator(a, ta);
     std::cout << GridLogMessage << a << std::endl;
-    assert(norm2(ta - adj(ta)) < 1.0e-6);
+    GRID_ASSERT(norm2(ta - adj(ta)) < 1.0e-6);
   }
   std::cout << GridLogMessage << std::endl;
 
@@ -505,7 +505,7 @@ static void testGenerators(GroupName::SU) {
     generator(a, ta);
     Complex tr = TensorRemove(trace(ta));
     std::cout << GridLogMessage << a << " " << std::endl;
-    assert(abs(tr) < 1.0e-6);
+    GRID_ASSERT(abs(tr) < 1.0e-6);
   }
   std::cout << GridLogMessage << std::endl;
 }
diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h
index cfc48bbf..5924760c 100644
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@@ -95,7 +95,7 @@ public:
     for (int a = 0; a < Dimension; a++) {
       generator(a, adjTa);
       std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(adjTa - conjugate(adjTa)) < 1.0e-6);
+      GRID_ASSERT(norm2(adjTa - conjugate(adjTa)) < 1.0e-6);
     }
     std::cout << GridLogMessage << std::endl;
 
@@ -104,7 +104,7 @@ public:
     for (int a = 0; a < Dimension; a++) {
       generator(a, adjTa);
       std::cout << GridLogMessage << a << std::endl;
-      assert(norm2(adjTa + transpose(adjTa)) < 1.0e-6);
+      GRID_ASSERT(norm2(adjTa + transpose(adjTa)) < 1.0e-6);
     }
     std::cout << GridLogMessage << std::endl;
   }
diff --git a/Grid/qcd/utils/Sp2n.impl.h b/Grid/qcd/utils/Sp2n.impl.h
index 196aba7e..5e57c491 100644
--- a/Grid/qcd/utils/Sp2n.impl.h
+++ b/Grid/qcd/utils/Sp2n.impl.h
@@ -209,7 +209,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
 template <ONLY_IF_Sp>
 static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
   const int nsp=ncolour/2;
-  assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));
+  GRID_ASSERT((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));
 
   int spare = su2_index;
   for (i1 = 0; spare >= (nsp - 1 - i1); i1++) {
@@ -231,8 +231,8 @@ static void testGenerators(GroupName::Sp) {
       Complex tr = TensorRemove(trace(ta * tb));
       std::cout << GridLogMessage << "(" << a << "," << b << ") =  " << tr
                 << std::endl;
-      if (a == b) assert(abs(tr - Complex(0.5)) < 1.0e-6);
-      if (a != b) assert(abs(tr) < 1.0e-6);
+      if (a == b) GRID_ASSERT(abs(tr - Complex(0.5)) < 1.0e-6);
+      if (a != b) GRID_ASSERT(abs(tr) < 1.0e-6);
     }
   }
   std::cout << GridLogMessage << std::endl;
@@ -241,7 +241,7 @@ static void testGenerators(GroupName::Sp) {
   for (int a = 0; a < AlgebraDimension; a++) {
     generator(a, ta);
     std::cout << GridLogMessage << a << std::endl;
-    assert(norm2(ta - adj(ta)) < 1.0e-6);
+    GRID_ASSERT(norm2(ta - adj(ta)) < 1.0e-6);
   }
   std::cout << GridLogMessage << std::endl;
   std::cout << GridLogMessage << "Fundamental - Checking if traceless"
@@ -250,13 +250,13 @@ static void testGenerators(GroupName::Sp) {
     generator(a, ta);
     Complex tr = TensorRemove(trace(ta));
     std::cout << GridLogMessage << a << std::endl;
-    assert(abs(tr) < 1.0e-6);
+    GRID_ASSERT(abs(tr) < 1.0e-6);
   }
 }
 
-template <int N>
-static Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > >
-ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu, GroupName::Sp) {
+template <class vtype, int N>
+static Lattice<iScalar<iScalar<iMatrix<vtype, N> > > >
+ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vtype, N> > > > &Umu, GroupName::Sp) {
   return ProjectOnSpGroup(Umu);
 }
 
diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h
index 7466f4bf..8091cbc8 100644
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -168,7 +168,7 @@ public:
   static std::vector<RealD> timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) {
     std::vector<RealD> sumplaq = timesliceSumSpatialPlaquette(Umu);
     int Lt = Umu.Grid()->FullDimensions()[Nd-1];
-    assert(sumplaq.size() == Lt);
+    GRID_ASSERT(sumplaq.size() == Lt);
     double vol = Umu.Grid()->gSites() / Lt;
     double faces = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0;
     for(int t=0;t<Lt;t++)
@@ -177,25 +177,43 @@ public:
   }
 
   //////////////////////////////////////////////////
-  // average over all x,y,z the temporal loop
+  // average Polyakov loop in mu direction over all directions != mu
   //////////////////////////////////////////////////
-  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
-    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu, const int mu) {  //assume Nd=4
+    
+    // Reject mu outside the valid range [0, 3]
+    if ((mu < 0 ) || (mu > 3)) {
+      std::cout << GridLogError << "Index is not an integer inclusively between 0 and 3." << std::endl;
+      exit(1);
+    }
+
+    // U_loop is U_{mu}
+    GaugeMat U_loop(Umu.Grid()), P(Umu.Grid());
     ComplexD out;
     int T = Umu.Grid()->GlobalDimensions()[3];
     int X = Umu.Grid()->GlobalDimensions()[0];
     int Y = Umu.Grid()->GlobalDimensions()[1];
     int Z = Umu.Grid()->GlobalDimensions()[2];
 
-    Ut = peekLorentz(Umu,3); //Select temporal direction
-    P = Ut;
-    for (int t=1;t<T;t++){ 
-      P = Gimpl::CovShiftForward(Ut,3,P);
+    // Number of sites in mu direction
+    int N_mu = Umu.Grid()->GlobalDimensions()[mu];
+
+    U_loop = peekLorentz(Umu, mu); //Select direction
+    P = U_loop;
+    for (int t=1;t<N_mu;t++){ 
+      P = Gimpl::CovShiftForward(U_loop,mu,P);
     }
    RealD norm = 1.0/(Nc*X*Y*Z*T);
    out = sum(trace(P))*norm;
    return out;   
-}
+  }  
+
+  /////////////////////////////////////////////////
+  // overload for temporal Polyakov loop
+  /////////////////////////////////////////////////
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) { 
+    return avgPolyakovLoop(Umu, 3);
+  }
 
   //////////////////////////////////////////////////
   // average over traced single links
@@ -347,7 +365,7 @@ public:
   //U: link array (Nd)
   /////////////
   static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
-    assert(staple.size() == Nd); assert(U.size() == Nd);
+    GRID_ASSERT(staple.size() == Nd); GRID_ASSERT(U.size() == Nd);
     for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu);
   }
 
@@ -373,7 +391,7 @@ public:
   public:
     //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
     const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
-      assert(pcell.depth >= this->paddingDepth());
+      GRID_ASSERT(pcell.depth >= this->paddingDepth());
       if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
       return *stencil;
     }
@@ -391,7 +409,7 @@ public:
     std::unique_ptr<PaddedCell> pcell;
 
     void generatePcell(GridBase* unpadded_grid){
-      assert(stencil_wk.size());
+      GRID_ASSERT(stencil_wk.size());
       int max_depth = 0;
       for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
       
@@ -402,7 +420,7 @@ public:
     //Add a stencil definition. This should be done before the first call to retrieve a stencil object.
     //Takes ownership of the pointer
     void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
-      assert(!pcell);
+      GRID_ASSERT(!pcell);
       stencil_wk.push_back(stencil);
     }
 
@@ -469,9 +487,9 @@ public:
   static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil)
   {
     double t0 = usecond();
-    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
-    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
-    assert(Cell.depth >= 1);
+    GRID_ASSERT(U_padded.size() == Nd); GRID_ASSERT(staple.size() == Nd);
+    GRID_ASSERT(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
+    GRID_ASSERT(Cell.depth >= 1);
     GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
 
     int shift_mu_off = gStencil._npoints/Nd;
@@ -622,7 +640,7 @@ public:
 
   static Real TopologicalCharge(const GaugeLorentz &U){
     // 4d topological charge
-    assert(Nd==4);
+    GRID_ASSERT(Nd==4);
     // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
     GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
     FieldStrength(Bx, U, Ydir, Zdir);
@@ -740,7 +758,7 @@ public:
   //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
   //output is the charge by timeslice: sum over timeslices to obtain the total
   static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
-    assert(Nd == 4);
+    GRID_ASSERT(Nd == 4);
     std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
     //Note F_numu = - F_munu
     //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu  or rho,sigma
@@ -1095,7 +1113,7 @@ public:
   //U: Gauge links in each direction (Nd)
   /////////////////////////////////////////////////////
   static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
-    assert(Stap.size() == Nd); assert(U.size() == Nd);
+    GRID_ASSERT(Stap.size() == Nd); GRID_ASSERT(U.size() == Nd);
     std::vector<GaugeMat> U2(Nd,U[0].Grid());
     for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu);
     for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu);
@@ -1179,9 +1197,9 @@ public:
   //gStencil: the stencil
   static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
     double t0 = usecond();
-    assert(U_padded.size() == Nd); assert(staple.size() == Nd);
-    assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
-    assert(Cell.depth >= 2);
+    GRID_ASSERT(U_padded.size() == Nd); GRID_ASSERT(staple.size() == Nd);
+    GRID_ASSERT(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
+    GRID_ASSERT(Cell.depth >= 2);
     GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
 
     size_t nshift = gStencil._npoints;
diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h
index 25481301..45f91b1d 100644
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -307,14 +307,14 @@ namespace Grid {
     constexpr unsigned int ContainerRank{Traits::Rank}; // Only non-zero for containers
     constexpr unsigned int TotalRank{TensorRank + ContainerRank};
     const Index NumElements{output.size()};
-    assert( NumElements > 0 );
+    GRID_ASSERT( NumElements > 0 );
 
     // Get the dimensionality of the tensor
     std::vector<std::size_t>  TotalDims(TotalRank);
     for(auto i = 0; i < TensorRank; i++ ) {
       auto dim = output.dimension(i);
       TotalDims[i] = static_cast<size_t>(dim);
-      assert( TotalDims[i] == dim ); // check we didn't lose anything in the conversion
+      GRID_ASSERT( TotalDims[i] == dim ); // check we didn't lose anything in the conversion
     }
     for(auto i = 0; i < ContainerRank; i++ )
       TotalDims[TensorRank + i] = Traits::Dimension(i);
@@ -452,7 +452,7 @@ namespace Grid {
     std::vector<std::size_t> dimData;
     std::vector<Scalar> buf;
     upcast->readMultiDim( s, buf, dimData );
-    assert(dimData.size() == TotalRank && "EigenIO: Tensor rank mismatch" );
+    GRID_ASSERT(dimData.size() == TotalRank && "EigenIO: Tensor rank mismatch" );
     // Make sure that the number of elements read matches dimensions read
     std::size_t NumContainers = 1;
     for( auto i = 0 ; i < TensorRank ; i++ )
@@ -460,10 +460,10 @@ namespace Grid {
     // If our scalar object is a Container, make sure it's dimensions match what we read back
     std::size_t ElementsPerContainer = 1;
     for( auto i = 0 ; i < ContainerRank ; i++ ) {
-      assert( dimData[TensorRank+i] == Traits::Dimension(i) && "Tensor Container dimensions don't match data" );
+      GRID_ASSERT( dimData[TensorRank+i] == Traits::Dimension(i) && "Tensor Container dimensions don't match data" );
       ElementsPerContainer *= dimData[TensorRank+i];
     }
-    assert( NumContainers * ElementsPerContainer == buf.size() && "EigenIO: Number of elements != product of dimensions" );
+    GRID_ASSERT( NumContainers * ElementsPerContainer == buf.size() && "EigenIO: Number of elements != product of dimensions" );
     // Now see whether the tensor is the right shape, or can be made to be
     const auto & dims = output.dimensions();
     bool bShapeOK = (output.data() != nullptr);
@@ -487,7 +487,7 @@ namespace Grid {
       for( int i = TensorRank - 1; i != -1 && ++MyIndex[i] == dims[i]; i-- )
         MyIndex[i] = 0;
     }
-    assert( pSource == &buf[NumContainers * ElementsPerContainer] );
+    GRID_ASSERT( pSource == &buf[NumContainers * ElementsPerContainer] );
   }
 
   template <typename T>
@@ -495,7 +495,7 @@ namespace Grid {
   typename std::enable_if<EigenIO::is_tensor_fixed<ETensor>::value, void>::type
   Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
   {
-    assert( 0 && "EigenIO: Fixed tensor dimensions can't be changed" );
+    GRID_ASSERT( 0 && "EigenIO: Fixed tensor dimensions can't be changed" );
   }
 
   template <typename T>
@@ -505,7 +505,7 @@ namespace Grid {
   {
 #ifdef GRID_OMP
     // The memory counter is the reason this must be done from the primary thread
-    assert(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread");
+    GRID_ASSERT(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread");
 #endif
     EigenIO::EigenResizeCounter -= static_cast<uint64_t>(t.size()) * sizeof(typename ETensor::Scalar);
     //t.reshape( dims );
@@ -561,7 +561,7 @@ namespace Grid {
     template <typename T1, typename T2>
     static inline typename std::enable_if<EigenIO::is_tensor<T1>::value && EigenIO::is_tensor<T2>::value, bool>::type
     CompareMember(const T1 &lhs, const T2 &rhs) {
-      // First check whether dimensions match (Eigen tensor library will assert if they don't match)
+      // First check whether dimensions match (Eigen tensor library will assert if they don't match)
       bool bReturnValue = (T1::NumIndices == T2::NumIndices);
       for( auto i = 0 ; bReturnValue && i < T1::NumIndices ; i++ )
           bReturnValue = ( lhs.dimension(i) == rhs.dimension(i) );
@@ -593,7 +593,7 @@ namespace Grid {
     WriteMember(std::ostream &os, const T &object) {
       using Index = typename T::Index;
       const Index NumElements{object.size()};
-      assert( NumElements > 0 );
+      GRID_ASSERT( NumElements > 0 );
       Index count = 1;
       os << "T<";
       for( int i = 0; i < T::NumIndices; i++ ) {
@@ -603,7 +603,7 @@ namespace Grid {
           os << ",";
         os << dim;
       }
-      assert( count == NumElements && "Number of elements doesn't match tensor dimensions" );
+      GRID_ASSERT( count == NumElements && "Number of elements doesn't match tensor dimensions" );
       os << ">{";
       const typename T::Scalar * p = object.data();
       for( Index i = 0; i < count; i++ ) {
diff --git a/Grid/serialisation/BinaryIO.h b/Grid/serialisation/BinaryIO.h
index 9ba34c16..afb23640 100644
--- a/Grid/serialisation/BinaryIO.h
+++ b/Grid/serialisation/BinaryIO.h
@@ -103,7 +103,7 @@ NAMESPACE_BEGIN(Grid);
     uint64_t tmp = 1;
     for( auto i = 0 ; i < rank ; i++ )
       tmp *= Dimensions[i];
-    assert( tmp == NumElements && "Dimensions don't match size of data being written" );
+    GRID_ASSERT( tmp == NumElements && "Dimensions don't match size of data being written" );
     // Total number of elements
     write("", tmp);
     // Number of dimensions
@@ -158,7 +158,7 @@ NAMESPACE_BEGIN(Grid);
       dim[i] = tmp;
       count *= tmp;
     }
-    assert( count == NumElements && "Dimensions don't match size of data being read" );
+    GRID_ASSERT( count == NumElements && "Dimensions don't match size of data being read" );
     buf.resize(count);
     for( auto i = 0; i < count; ++i)
       read("", buf[i]);
diff --git a/Grid/serialisation/Hdf5IO.h b/Grid/serialisation/Hdf5IO.h
index ae5e740b..13a49f9e 100644
--- a/Grid/serialisation/Hdf5IO.h
+++ b/Grid/serialisation/Hdf5IO.h
@@ -174,7 +174,7 @@ namespace Grid
           // Now make sure overall size is not too big
           hsize_t OverflowCheck = ElementsPerChunk;
           ElementsPerChunk *= d;
-          assert( OverflowCheck == ElementsPerChunk / d && "Product of dimensions overflowed hsize_t" );
+          GRID_ASSERT( OverflowCheck == ElementsPerChunk / d && "Product of dimensions overflowed hsize_t" );
           // If product of dimensions too big, reduce by prime factors
           while( ElementsPerChunk > MaxElements && ( ElementsPerChunk & 1 ) == 0 ) {
             bTooBig = true;
diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h
index 8f490c64..72156bc8 100644
--- a/Grid/serialisation/VectorUtils.h
+++ b/Grid/serialisation/VectorUtils.h
@@ -447,9 +447,9 @@ namespace Grid {
     using Traits = GridTypeMapper<typename is_flattenable<W>::grid_type>;
     const int gridRank{Traits::Rank};
     const int dimRank{static_cast<int>(dim_.size())};
-    assert(dimRank >= gridRank && "Tensor rank too low for Grid tensor");
+    GRID_ASSERT(dimRank >= gridRank && "Tensor rank too low for Grid tensor");
     for (int i=0; i<gridRank; ++i) {
-      assert(dim_[dimRank - gridRank + i] == Traits::Dimension(i) && "Tensor dimension doesn't match Grid tensor");
+      GRID_ASSERT(dim_[dimRank - gridRank + i] == Traits::Dimension(i) && "Tensor dimension doesn't match Grid tensor");
     }
     dim_.resize(dimRank - gridRank);
   }
@@ -461,7 +461,7 @@ namespace Grid {
   , dim_(dim)
   {
     checkInnermost(vector_);
-    assert(dim_.size() == is_flattenable<V>::vecRank && "Tensor rank doesn't match nested std::vector rank");
+    GRID_ASSERT(dim_.size() == is_flattenable<V>::vecRank && "Tensor rank doesn't match nested std::vector rank");
     resize(vector_, 0);
     fill(vector_);
   }
@@ -512,14 +512,14 @@ namespace Grid {
   {
     if( bFirst)
     {
-      assert( Dims.size() == Depth     && "Bug: Delete this message after testing" );
+      GRID_ASSERT( Dims.size() == Depth     && "Bug: Delete this message after testing" );
       Dims.push_back(v[0].size());
       if (!Dims[Depth])
         return false;
     }
     else
     {
-      assert( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" );
+      GRID_ASSERT( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" );
     }
     for (std::size_t i = 0; i < v.size(); ++i)
     {
diff --git a/Grid/serialisation/XmlIO.h b/Grid/serialisation/XmlIO.h
index cb9a49e7..2b568421 100644
--- a/Grid/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@@ -145,7 +145,7 @@ namespace Grid
       write("dim", d);
       count *= d;
     }
-    assert( count == NumElements && "XmlIO : element count doesn't match dimensions" );
+    GRID_ASSERT( count == NumElements && "XmlIO : element count doesn't match dimensions" );
     static const char sName[] = "tensor";
     for( int i = 0 ; i < Rank ; i++ ) {
       MyIndex[i] = 0;
diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h
index 66d25bc4..8de0d34d 100644
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -52,6 +52,10 @@ class GeneralLocalStencilView {
     return & this->_entries_p[point+this->_npoints*osite]; 
   }
   void ViewClose(void){};
+#ifdef GRID_LOG_VIEWS
+  size_t size() { return 0; };
+  uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
+#endif
 };
 ////////////////////////////////////////
 // The Stencil Class itself
@@ -114,7 +118,7 @@ public:
 	    int ld = grid->_ldimensions[d];
 	    int ly = grid->_simd_layout[d];
 
-	    assert((ly==1)||(ly==2)||(ly==grid->Nsimd()));
+	    GRID_ASSERT((ly==1)||(ly==2)||(ly==grid->Nsimd()));
 
 	    int shift = (shifts[ii][d]+fd)%fd;  // make it strictly positive 0.. L-1
 	    int x = Coor[d];                // x in [0... rd-1] as an oSite 
diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h
index eca9cd3c..41c1fbe1 100644
--- a/Grid/stencil/SimpleCompressor.h
+++ b/Grid/stencil/SimpleCompressor.h
@@ -39,7 +39,7 @@ public:
 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
 				    compressor &compress,int type,int partial)
   {
-    assert( (table.size()&0x1)==0);
+    GRID_ASSERT( (table.size()&0x1)==0);
     int num=table.size()/2;
     int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
     
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 25d194b5..090b7fe0 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -277,7 +277,7 @@ protected:
     device_heap_bytes+= bytes;
     if ( device_heap_bytes > device_heap_size ) {
       std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
-      assert (device_heap_bytes <= device_heap_size);
+      GRID_ASSERT (device_heap_bytes <= device_heap_size);
     }
     return ptr;
   }
@@ -390,7 +390,7 @@ public:
     else                nbr_proc = pd-1;
 
     // FIXME  this logic needs to be sorted for three link term
-    //    assert( (displacement==1) || (displacement==-1));
+    //    GRID_ASSERT( (displacement==1) || (displacement==-1));
     // Present hack only works for >= 4^4 subvol per node
     _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
 
@@ -633,7 +633,7 @@ public:
     // Map to always positive shift modulo global full dimension.
     int shift = (displacement+fd)%fd;
 
-    assert (source.Checkerboard()== this->_checkerboard);
+    GRID_ASSERT (source.Checkerboard()== this->_checkerboard);
 
     // the permute type
     int simd_layout     = _grid->_simd_layout[dimension];
@@ -680,7 +680,7 @@ public:
     //////////////////////////////////
     _grid->StencilBarrier();// Synch shared memory on a single nodes
 
-    assert(source.Grid()==_grid);
+    GRID_ASSERT(source.Grid()==_grid);
 
     u_comm_offset=0;
 
@@ -697,7 +697,7 @@ public:
     // Or issue barrier AFTER the DMA is running
 #endif    
     face_table_computed=1;
-    assert(u_comm_offset==_unified_buffer_size);
+    GRID_ASSERT(u_comm_offset==_unified_buffer_size);
   }
 
   /////////////////////////
@@ -751,7 +751,7 @@ public:
     obj.xbytes      = xbytes;
     obj.rbytes      = rbytes;
     obj.cb          = cb;
-
+    
     for(int i=0;i<CachedTransfers.size();i++){
       if (   (CachedTransfers[i].direction  ==direction)
 	   &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
@@ -763,11 +763,13 @@ public:
 	     ){
 	// FIXME worry about duplicate with partial compression
 	// Wont happen as DWF has no duplicates, but...
-	AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
-	return 1;
+	//	AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
+	//	std::cout << "Duplicate dir " <<direction<<" "<<" OrthogPlane "<<OrthogPlane<<" Dest"<<DestProc <<" xbytes " <<xbytes<<" lane "<< lane<<" cb "<<cb<<std::endl;
+	return 0;
+	
+	//	return 1;
       }
     }
-
     CachedTransfers.push_back(obj);
     return 0;
   }
@@ -812,8 +814,8 @@ public:
     CommsMerge(decompress,Mergers,Decompressions);
   }
   template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
-    assert(MergersSHM.size()==0);
-    assert(DecompressionsSHM.size()==0);
+    GRID_ASSERT(MergersSHM.size()==0);
+    GRID_ASSERT(DecompressionsSHM.size()==0);
   }
 
   template<class decompressor>
@@ -1003,7 +1005,7 @@ public:
       int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
       int rotate_dim      = _grid->_simd_layout[dimension]>2;
 
-      assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
+      GRID_ASSERT ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
 
       int sshift[2];
       //////////////////////////
@@ -1117,10 +1119,10 @@ public:
     int simd_layout     = _grid->_simd_layout[dimension];
     int comm_dim        = _grid->_processors[dimension] >1 ;
 
-    assert(comm_dim==1);
+    GRID_ASSERT(comm_dim==1);
     int shift = (shiftpm + fd) %fd;
-    assert(shift>=0);
-    assert(shift<fd);
+    GRID_ASSERT(shift>=0);
+    GRID_ASSERT(shift<fd);
 
     // done in reduced dims, so SIMD factored
     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
@@ -1301,7 +1303,7 @@ public:
     //    int comms_partial_send   = this->_comms_partial_send[point] ;
     //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
     
-    assert(rhs.Grid()==_grid);
+    GRID_ASSERT(rhs.Grid()==_grid);
     //	  conformable(_grid,rhs.Grid());
 
     int fd              = _grid->_fdimensions[dimension];
@@ -1309,10 +1311,10 @@ public:
     int pd              = _grid->_processors[dimension];
     int simd_layout     = _grid->_simd_layout[dimension];
     int comm_dim        = _grid->_processors[dimension] >1 ;
-    assert(simd_layout==1);
-    assert(comm_dim==1);
-    assert(shift>=0);
-    assert(shift<fd);
+    GRID_ASSERT(simd_layout==1);
+    GRID_ASSERT(comm_dim==1);
+    GRID_ASSERT(shift>=0);
+    GRID_ASSERT(shift<fd);
 
     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 
@@ -1450,11 +1452,11 @@ public:
     int pd              = _grid->_processors[dimension];
     int simd_layout     = _grid->_simd_layout[dimension];
     int comm_dim        = _grid->_processors[dimension] >1 ;
-    assert(comm_dim==1);
+    GRID_ASSERT(comm_dim==1);
     // This will not work with a rotate dim
-    assert(simd_layout==maxl);
-    assert(shift>=0);
-    assert(shift<fd);
+    GRID_ASSERT(simd_layout==maxl);
+    GRID_ASSERT(shift>=0);
+    GRID_ASSERT(shift<fd);
 
 
     int permute_type=_grid->PermuteType(dimension);
@@ -1465,8 +1467,8 @@ public:
     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
     //    int words = sizeof(cobj)/sizeof(vector_type);
 
-    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
-                         // This assert will trap it if ever hit. Not hit normally so far
+    GRID_ASSERT(cbmask==0x3); // Fixme think there is a latent bug if not true
+                         // This GRID_ASSERT will trap it if ever hit. Not hit normally so far
     int reduced_buffer_size = buffer_size;
     if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1;
 
@@ -1477,7 +1479,7 @@ public:
     int xbytes; 
     int rbytes; 
     
-    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
+    GRID_ASSERT(bytes*simd_layout == reduced_buffer_size*datum_bytes);
 
     std::vector<cobj *> rpointers(maxl);
     std::vector<cobj *> spointers(maxl);
@@ -1550,7 +1552,7 @@ public:
 	  int nbr_ox   = (nbr_lcoor%rd);      // outer coord of peer "x"
 
 	  int nbr_plane = nbr_ic;
-	  assert (sx == nbr_ox);
+	  GRID_ASSERT (sx == nbr_ox);
 
 	  auto rp = &u_simd_recv_buf[i        ][comm_off];
 	  auto sp = &u_simd_send_buf[nbr_plane][comm_off];
diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h
index e63e228c..a94dabc4 100644
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////
 
 // It is useful to NOT have any constructors
-// so that these classes assert "is_pod<class> == true"
+// so that these classes assert "is_pod<class> == true"
 // because then the standard C++ valarray container eliminates fill overhead on
 // new allocation and
 // non-move copying.
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 2b91165c..2c18796d 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -205,7 +205,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
 	     cudaGetErrorString( err ));				\
       printf("File %s Line %d\n",__FILE__,__LINE__);			\
       fflush(stdout);							\
-      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
+      if (acceleratorAbortOnGpuError) GRID_ASSERT(err==cudaSuccess);		\
     }									\
   }
 
@@ -216,7 +216,7 @@ inline void *acceleratorAllocHost(size_t bytes)
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
     printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
-    assert(0);
+    GRID_ASSERT(0);
   }
   return ptr;
 }
@@ -227,7 +227,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
     printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
-    assert(0);
+    GRID_ASSERT(0);
   }
   return ptr;
 };
@@ -276,7 +276,7 @@ inline int  acceleratorIsCommunicable(void *ptr)
   //  int uvm=0;
   //  auto 
   //  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
-  //  assert(cuerr == cudaSuccess );
+  //  GRID_ASSERT(cuerr == cudaSuccess );
   //  if(uvm) return 0;
   //  else    return 1;
     return 1;
diff --git a/Grid/threads/ThreadReduction.h b/Grid/threads/ThreadReduction.h
index 2cb3e90d..05b5ce02 100644
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -66,7 +66,7 @@ public:
     _threads = 1;
 #endif
   };
-  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
+  static int GetHyperThreads(void) { GRID_ASSERT(_threads%_cores ==0); return _threads/_cores; };
   static int GetCores(void)   { return _cores; };
   static int GetThreads(void) { return _threads; };
   static int SumArraySize(void) {return _threads;};
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index cdb4fa62..c517ca13 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -77,7 +77,7 @@ inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
   const uint64_t *ufrom = (const uint64_t *)from;
   uint64_t *uto   = (uint64_t *)to;
-  assert(bytes%8==0);
+  GRID_ASSERT(bytes%8==0);
   uint64_t words=bytes/8;
   thread_for(w,words,{
       uto[w] = ufrom[w];
diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc
index 466ce071..b5baeea9 100644
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@@ -47,6 +47,7 @@ int32_t  FlightRecorder::CsumLoggingCounter;
 int32_t  FlightRecorder::NormLoggingCounter;
 int32_t  FlightRecorder::ReductionLoggingCounter;
 uint64_t FlightRecorder::ErrorCounter;
+
 std::vector<double> FlightRecorder::NormLogVector;
 std::vector<double> FlightRecorder::ReductionLogVector;
 std::vector<uint64_t> FlightRecorder::CsumLogVector;
@@ -89,7 +90,7 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
     Truncate();
     break;
   default:
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 bool FlightRecorder::StepLog(const char *name)
@@ -260,7 +261,7 @@ void FlightRecorder::ReductionLog(double local,double global)
 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
 	BACKTRACEFP(stderr);
 	
-	if ( !ContinueOnFail ) assert(0);
+	if ( !ContinueOnFail ) GRID_ASSERT(0);
 
 	ErrorCounter++;
       } else {
@@ -308,7 +309,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
 	BACKTRACEFP(stderr);
 	
-	if ( !ContinueOnFail ) assert(0);
+	if ( !ContinueOnFail ) GRID_ASSERT(0);
 
 	ErrorCounter++;
       } else {
@@ -354,7 +355,7 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
 	BACKTRACEFP(stderr);
 	
-	if ( !ContinueOnFail ) assert(0);
+	if ( !ContinueOnFail ) GRID_ASSERT(0);
 
 	ErrorCounter++;
       } else {
@@ -379,8 +380,12 @@ std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
 
 void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
 void ViewLogger::End() { Enabled = false; }
-
-void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
+#ifdef GRID_LOG_VIEWS_FENCEPOST
+void ViewLogger::LogOpen(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
+{
+  ViewLogger::LogClose(filename,line,index,mode,data,bytes);
+}
+void ViewLogger::LogClose(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
 {
   if (!Enabled)
    return;
@@ -416,6 +421,27 @@ void ViewLogger::Log(const char* filename, int line, int index, int mode, void*
     }
   }
 }
+#else
+void ViewLogger::LogOpen(const char* filename, int line, int index, int mode, void* data, uint64_t bytes){ }
+void ViewLogger::LogClose(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
+{
+  if (!Enabled)
+   return;
+
+  if (bytes < sizeof(uint64_t)) return;
+   
+#ifdef GRID_SYCL
+  uint64_t *u_data = (uint64_t *)data;
+  switch (mode) {
+    case AcceleratorWrite:
+    case AcceleratorWriteDiscard:
+      uint64_t csum = checksum_gpu(u_data,bytes/sizeof(uint64_t));
+      FlightRecorder::CsumLog(csum);
+      break;
+  }
+#endif
+}
+#endif
 
 #endif
 
diff --git a/Grid/util/FlightRecorder.h b/Grid/util/FlightRecorder.h
index 7cad5a61..63ff2d04 100644
--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@@ -56,7 +56,8 @@ public:
   static std::vector<Entry_t> LogVector;
   static void Begin();
   static void End();
-  static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
+  static void LogOpen(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
+  static void LogClose(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
 };
 #endif
 NAMESPACE_END(Grid);
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 90f48b7f..23061372 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -85,6 +85,8 @@ feenableexcept (unsigned int excepts)
 #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #endif
 
+void * Grid_backtrace_buffer[_NBACKTRACE];
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////
@@ -118,7 +120,7 @@ const Coordinate GridDefaultSimd(int dims,int nsimd)
       layout[d]=1;
     }
   }
-  assert(nn==1);
+  GRID_ASSERT(nn==1);
   return layout;
 }
 
@@ -213,14 +215,14 @@ void GridParseLayout(char **argv,int argc,
 #endif
     arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
     GridCmdOptionIntVector(arg,ompthreads);
-    assert(ompthreads.size()==1);
+    GRID_ASSERT(ompthreads.size()==1);
     GridThread::SetThreads(ompthreads[0]);
   }
   if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
     std::vector<int> gputhreads(0);
     arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
     GridCmdOptionIntVector(arg,gputhreads);
-    assert(gputhreads.size()==1);
+    GRID_ASSERT(gputhreads.size()==1);
     acceleratorThreads(gputhreads[0]);
   }
 
@@ -232,7 +234,7 @@ void GridParseLayout(char **argv,int argc,
   }
   // Copy back into coordinate format
   int nd = mpi.size();
-  assert(latt.size()==nd);
+  GRID_ASSERT(latt.size()==nd);
   latt_c.resize(nd);
    mpi_c.resize(nd);
   for(int d=0;d<nd;d++){
@@ -315,8 +317,8 @@ std::vector<dlRegion> dlMap;
 
 void Grid_init(int *argc,char ***argv)
 {
-
-  assert(Grid_is_initialised == 0);
+  
+  GRID_ASSERT(Grid_is_initialised == 0);
 
   GridLogger::GlobalStopWatch.Start();
 
@@ -361,24 +363,6 @@ void Grid_init(int *argc,char ***argv)
     GlobalSharedMemory::Hugepages = 1;
   }
 
-
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
-    Grid_debug_handler_init();
-  }
-  // Sleep n-seconds at end of handler
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
-    GridCmdOptionInt(arg,signal_delay);
-  }
-  // periodic wakeup with stack trace printed
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
-    Grid_debug_heartbeat();
-  }
-  // periodic wakeup with empty handler (interrupts some system calls)
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
-    Grid_heartbeat();
-  }
-
 #if defined(A64FX)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
     std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." << std::endl;
@@ -418,7 +402,7 @@ void Grid_init(int *argc,char ***argv)
     std::ostringstream fname;
 
     int rank = CartesianCommunicator::RankWorld();
-    int radix=64;
+    int radix=32;
     char* root = getenv("GRID_STDOUT_ROOT");
     if (root) {
       fname << root ;
@@ -430,8 +414,11 @@ void Grid_init(int *argc,char ***argv)
     fname << "/";
     fname<<"Grid.stdout.";
     fname<<CartesianCommunicator::RankWorld();
+
+    std::cout << " Reconnecting stdout to "<<fname.str()<<std::endl;
+    
     fp=freopen(fname.str().c_str(),"w",stdout);
-    assert(fp!=(FILE *)NULL);
+    GRID_ASSERT(fp!=(FILE *)NULL);
 
     std::ostringstream ename;
     if (root){
@@ -440,12 +427,14 @@ void Grid_init(int *argc,char ***argv)
     ename << (rank/radix)*radix << "/";
     ename<<"Grid.stderr.";
     ename<<CartesianCommunicator::RankWorld();
+    std::cout << " Reconnecting stderr to "<<ename.str()<<std::endl;
     fp=freopen(ename.str().c_str(),"w",stderr);
-    assert(fp!=(FILE *)NULL);
+    GRID_ASSERT(fp!=(FILE *)NULL);
   }
   fileno_stdout = fileno(stdout);
   fileno_stderr = fileno(stderr) ;
-    
+  dup2(fileno_stdout, STDOUT_FILENO);
+  dup2(fileno_stderr, STDERR_FILENO);
   ////////////////////////////////////////////////////
   // OK to use GridLogMessage etc from here on
   ////////////////////////////////////////////////////
@@ -578,7 +567,7 @@ void Grid_init(int *argc,char ***argv)
   }
 
   ////////////////////////////////////
-  // Debug and performance options
+  // Performance options
   ////////////////////////////////////
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
@@ -601,6 +590,10 @@ void Grid_init(int *argc,char ***argv)
     StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
   }
 
+  ////////////////////////////////
+  // Timestamping or not
+  ////////////////////////////////
+
   CartesianCommunicator::nCommThreads = 1;
   if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
     GridLogTimestamp(0);
@@ -608,18 +601,13 @@ void Grid_init(int *argc,char ***argv)
     GridLogTimestamp(1);
   }
 
+  ////////////////////////////////
+  // Default layout
+  ////////////////////////////////
   GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
 
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
-    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
-    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
-    FlightRecorder::PrintEntireLog = 1;
-    FlightRecorder::ChecksumComms  = 1;
-    FlightRecorder::ChecksumCommsSend=1;
-  }
-  
   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@@ -629,6 +617,36 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
     std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
   }
+
+  ////////////////////////////////////
+  // Debug options
+  ////////////////////////////////////
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
+    Grid_debug_handler_init();
+  }
+  // Sleep n-seconds at end of handler
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
+    GridCmdOptionInt(arg,signal_delay);
+  }
+  // periodic wakeup with stack trace printed
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
+    Grid_debug_heartbeat();
+  }
+  // periodic wakeup with empty handler (interrupts some system calls)
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
+    Grid_heartbeat();
+  }
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms  = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }
+  
   Grid_is_initialised = 1;
 }
 
@@ -657,7 +675,6 @@ void GridLogLayout() {
   std::cout << GridLogMessage << "\tMPI tasks            : "<< GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
 }
 
-void * Grid_backtrace_buffer[_NBACKTRACE];
 #define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
 
 void sig_print_dig(uint32_t dig)
@@ -841,8 +858,8 @@ void Grid_heartbeat(void)
 
-  // repeating 10s heartbeat
+  // repeating 10ms heartbeat
   struct itimerval it_val;
-  it_val.it_value.tv_sec = 10;
-  it_val.it_value.tv_usec = 1000;
+  it_val.it_value.tv_sec = 0;
+  it_val.it_value.tv_usec = 10000;
   it_val.it_interval = it_val.it_value;
   setitimer(ITIMER_REAL, &it_val, NULL);
 }
@@ -859,6 +876,7 @@ void Grid_debug_handler_init(void)
   sa.sa_flags    = SA_SIGINFO;
   sigaction(SIGTRAP,&sa,NULL);
   sigaction(SIGILL,&sa,NULL);
+  sigaction(SIGABRT,&sa,NULL); // SigABRT backtrace
 #ifndef GRID_SYCL
   sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
   sigaction(SIGBUS,&sa,NULL);
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index b4063e48..d494760f 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -52,7 +52,7 @@ namespace Grid{
       if ( index64>=2*1024*1024*1024LL ){
 	//	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
       }
-      assert(index64<2*1024*1024*1024LL);
+      GRID_ASSERT(index64<2*1024*1024*1024LL);
       index = (int) index64;
     }
     template<class coor_t>
diff --git a/HMC/ComputeWilsonFlow.cc b/HMC/ComputeWilsonFlow.cc
index b273ee40..9bc93536 100644
--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -38,6 +38,7 @@ namespace Grid{
             int, meas_interval,
 	    double, maxTau, // for the adaptive algorithm
 	    int, meas_interval_density,
+	    std::string, flow_type,
 	    std::string, path); 
        
 
@@ -127,13 +128,21 @@ int main(int argc, char **argv) {
   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
 
   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
-  
+  WilsonGaugeAction<PeriodicGimplR>   WGA(3.0);
+  IwasakiGaugeAction<PeriodicGimplR>  IWGA(3.0);
+  SymanzikGaugeAction<PeriodicGimplR> SZGA(3.0);
+  DBW2GaugeAction<PeriodicGimplR>     DBGA(3.0);
+  if (     WFPar.flow_type == std::string("Wilson"))   WF.setGaugeAction(&WGA);
+  else if (WFPar.flow_type == std::string("Iwasaki" )) WF.setGaugeAction(&IWGA);
+  else if (WFPar.flow_type == std::string("Symanzik")) WF.setGaugeAction(&SZGA);
+  else if (WFPar.flow_type == std::string("DBW2"))     WF.setGaugeAction(&DBGA);
+    
   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
     
     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
     typedef typename PeriodicGimplR::ComplexField ComplexField;
     
-    assert(Nd == 4);
+    GRID_ASSERT(Nd == 4);
 
     // NOTE:
     // Ideally, turn the folloing into methods of the appropriate class
diff --git a/HMC/FTHMC2p1f.cc b/HMC/FTHMC2p1f.cc
index 1e914e87..6a18466e 100644
--- a/HMC/FTHMC2p1f.cc
+++ b/HMC/FTHMC2p1f.cc
@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,3 +234,4 @@ int main(int argc, char **argv)
 #endif
 } // main
 
+#endif
diff --git a/HMC/FTHMC2p1f_3GeV.cc b/HMC/FTHMC2p1f_3GeV.cc
index 36d5caa3..db2b937b 100644
--- a/HMC/FTHMC2p1f_3GeV.cc
+++ b/HMC/FTHMC2p1f_3GeV.cc
@@ -25,7 +25,11 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -231,5 +235,4 @@ int main(int argc, char **argv)
 #endif
 } // main
 
-
-
+#endif
diff --git a/HMC/HMC2p1f_3GeV.cc b/HMC/HMC2p1f_3GeV.cc
index 199d4be8..d97b03c6 100644
--- a/HMC/HMC2p1f_3GeV.cc
+++ b/HMC/HMC2p1f_3GeV.cc
@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,5 +234,4 @@ int main(int argc, char **argv)
 #endif
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc
index 8042d6e6..b6b9da06 100644
--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 int main(int argc, char **argv) {
   using namespace Grid;
@@ -195,5 +199,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1fEOFA.cc b/HMC/Mobius2p1fEOFA.cc
index c961cbc9..ce7518d9 100644
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
@@ -107,7 +111,7 @@ NAMESPACE_BEGIN(Grid);
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -449,5 +453,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1fEOFA_F1.cc b/HMC/Mobius2p1fEOFA_F1.cc
index f910d69e..e9f2bb3b 100644
--- a/HMC/Mobius2p1fEOFA_F1.cc
+++ b/HMC/Mobius2p1fEOFA_F1.cc
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
@@ -107,7 +111,7 @@ NAMESPACE_BEGIN(Grid);
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -442,5 +446,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
index 35ec2246..3fe94313 100644
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 using namespace Grid;
 
@@ -176,7 +180,7 @@ void computeEigenvalues(std::string param_file,
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
-  assert(params.mu == 0.0);
+  GRID_ASSERT(params.mu == 0.0);
 
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
@@ -202,7 +206,7 @@ template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr, int action_or_md){
-  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+  GRID_ASSERT(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
   
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
@@ -291,13 +295,13 @@ public:
   EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
 
   typedef typename FermionImplPolicy::FermionField Field;
-  void OpDiag (const Field &in, Field &out){ assert(0); }
-  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); } 
 
-  void Op     (const Field &in, Field &out){ assert(0); }
-  void AdjOp  (const Field &in, Field &out){ assert(0); }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ GRID_ASSERT(0); }
   void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
 };
 
@@ -322,13 +326,13 @@ public:
   EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
 
   typedef typename FermionImplPolicy::FermionField Field;
-  void OpDiag (const Field &in, Field &out){ assert(0); }
-  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); } 
 
-  void Op     (const Field &in, Field &out){ assert(0); }
-  void AdjOp  (const Field &in, Field &out){ assert(0); }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ GRID_ASSERT(0); }
   void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
 };
 
@@ -400,7 +404,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
 
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       precisionChange(FermOpF.Umu, FermOpD.Umu);
 
@@ -466,7 +470,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
 
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       precisionChange(FermOpF.Umu, FermOpD.Umu);
 
@@ -508,14 +512,14 @@ int main(int argc, char **argv) {
   while(i < argc){
     std::string sarg(argv[i]);
     if(sarg == "--param_file"){
-      assert(i!=argc-1);
+      GRID_ASSERT(i!=argc-1);
       param_file = argv[i+1];
       i+=2;
     }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
       file_load_check = true;
       i++;
     }else if(sarg == "--set_seeds"){ //set the rng seeds. Expects two vector args, e.g.  --set_seeds 1.2.3.4 5.6.7.8
-      assert(i < argc-2);
+      GRID_ASSERT(i < argc-2);
       std::vector<int> tmp;
       GridCmdOptionIntVector(argv[i+1],tmp);
       {
@@ -713,7 +717,7 @@ int main(int argc, char **argv) {
   std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
   std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
   int n_light_hsb = 5;
-  assert(user_params.eofa_l.size() == n_light_hsb);
+  GRID_ASSERT(user_params.eofa_l.size() == n_light_hsb);
   
   EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
 
@@ -837,42 +841,42 @@ int main(int argc, char **argv) {
   for(int i=1;i<argc;i++){
     std::string sarg(argv[i]);
     if(sarg == "--tune_rhmc_s"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       tune_rhmc_s=true;
       tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
     }
     else if(sarg == "--eigenrange_s"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       eigenrange_s=true;
       lanc_params_s = argv[i+1];
     }
     else if(sarg == "--tune_rhmc_DSDR"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       tune_rhmc_DSDR=true;
       tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
     }
     else if(sarg == "--eigenrange_DSDR"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       eigenrange_DSDR=true;
       lanc_params_DSDR = argv[i+1];
     }
     else if(sarg == "--check_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       check_eofa = true;
       eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
-      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+      GRID_ASSERT(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
     }
     else if(sarg == "--upper_bound_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       upper_bound_eofa = true;
       eofa_which_hsb = std::stoi(argv[i+1]);
-      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+      GRID_ASSERT(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
     }
     else if(sarg == "--lower_bound_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       lower_bound_eofa = true;      
       eofa_which_hsb = std::stoi(argv[i+1]);
-      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+      GRID_ASSERT(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
     }
   }
   if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
@@ -918,3 +922,5 @@ int main(int argc, char **argv) {
   return 0;
 #endif
 } // main
+
+#endif
diff --git a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
index 004a0953..d8a50f6b 100644
--- a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
+++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 using namespace Grid;
 
@@ -176,7 +180,7 @@ void computeEigenvalues(std::string param_file,
   SchurDiagMooeeOperator<FermionActionD, FermionFieldD> hermop(action);
   PlainHermOp<FermionFieldD> hermop_wrap(hermop);
   //ChebyshevLanczos<FermionFieldD> Cheb(params.alpha, params.beta, params.mu, params.ord);
-  assert(params.mu == 0.0);
+  GRID_ASSERT(params.mu == 0.0);
 
   Chebyshev<FermionFieldD> Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1);
   FunctionHermOp<FermionFieldD> Cheb_wrap(Cheb, hermop);
@@ -202,7 +206,7 @@ template<typename FermionActionD, typename FermionFieldD, typename RHMCtype>
 void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt,  //expect lattice to have been initialized to something
 	       FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng,
 	       int inv_pow, const std::string &quark_descr, int action_or_md){
-  assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
+  GRID_ASSERT(action_or_md == 0 || action_or_md == 1 || action_or_md == 2);
   
   FermionFieldD gauss_o(rbGrid);
   FermionFieldD gauss(Grid);
@@ -291,13 +295,13 @@ public:
   EOFAlinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
 
   typedef typename FermionImplPolicy::FermionField Field;
-  void OpDiag (const Field &in, Field &out){ assert(0); }
-  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); } 
 
-  void Op     (const Field &in, Field &out){ assert(0); }
-  void AdjOp  (const Field &in, Field &out){ assert(0); }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ GRID_ASSERT(0); }
   void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); }
 };
 
@@ -322,13 +326,13 @@ public:
   EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){}
 
   typedef typename FermionImplPolicy::FermionField Field;
-  void OpDiag (const Field &in, Field &out){ assert(0); }
-  void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); } 
+  void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); } 
 
-  void Op     (const Field &in, Field &out){ assert(0); }
-  void AdjOp  (const Field &in, Field &out){ assert(0); }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
+  void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ GRID_ASSERT(0); }
   void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); }
 };
 
@@ -400,7 +404,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
 
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       precisionChange(FermOpF.Umu, FermOpD.Umu);
 
@@ -465,7 +469,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<<std::endl;
 
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       precisionChange(FermOpF.Umu, FermOpD.Umu);
 
@@ -502,7 +506,7 @@ int main(int argc, char **argv) {
   for(int i=1;i<argc;i++){
     std::string sarg(argv[i]);
     if(sarg == "--param_file"){
-      assert(i!=argc-1);
+      GRID_ASSERT(i!=argc-1);
       param_file = argv[i+1];
     }else if(sarg == "--read_check"){ //check the fields load correctly and pass checksum/plaquette repro
       file_load_check = true;
@@ -665,7 +669,7 @@ int main(int argc, char **argv) {
   std::vector<RealD> eofa_light_masses = { light_mass ,  0.004,   0.016,   0.064,   0.256    };
   std::vector<RealD> eofa_pv_masses =    { 0.004       , 0.016,   0.064,   0.256,   1.0      };
   int n_light_hsb = 5;
-  assert(user_params.eofa_l.size() == n_light_hsb);
+  GRID_ASSERT(user_params.eofa_l.size() == n_light_hsb);
   
   EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb];
 
@@ -791,42 +795,42 @@ int main(int argc, char **argv) {
   for(int i=1;i<argc;i++){
     std::string sarg(argv[i]);
     if(sarg == "--tune_rhmc_s"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       tune_rhmc_s=true;
       tune_rhmc_s_action_or_md = std::stoi(argv[i+1]);
     }
     else if(sarg == "--eigenrange_s"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       eigenrange_s=true;
       lanc_params_s = argv[i+1];
     }
     else if(sarg == "--tune_rhmc_DSDR"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       tune_rhmc_DSDR=true;
       tune_rhmc_DSDR_action_or_md = std::stoi(argv[i+1]);
     }
     else if(sarg == "--eigenrange_DSDR"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       eigenrange_DSDR=true;
       lanc_params_DSDR = argv[i+1];
     }
     else if(sarg == "--check_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       check_eofa = true;
       eofa_which_hsb = std::stoi(argv[i+1]); //-1 indicates all hasenbusch
-      assert(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
+      GRID_ASSERT(eofa_which_hsb == -1 || (eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb) );
     }
     else if(sarg == "--upper_bound_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       upper_bound_eofa = true;
       eofa_which_hsb = std::stoi(argv[i+1]);
-      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+      GRID_ASSERT(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
     }
     else if(sarg == "--lower_bound_eofa"){
-      assert(i < argc-1);
+      GRID_ASSERT(i < argc-1);
       lower_bound_eofa = true;      
       eofa_which_hsb = std::stoi(argv[i+1]);
-      assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
+      GRID_ASSERT(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb);
     }
   }
   if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) {
@@ -873,3 +877,5 @@ int main(int argc, char **argv) {
   return 0;
 #endif
 } // main
+
+#endif
diff --git a/HMC/Mobius2p1fRHMC.cc b/HMC/Mobius2p1fRHMC.cc
index 288a6c54..c49995b8 100644
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 int main(int argc, char **argv) {
   using namespace Grid;
@@ -193,5 +197,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc b/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc
index c305567c..1aa22332 100644
--- a/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 NAMESPACE_BEGIN(Grid);
 
@@ -95,7 +99,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -512,5 +516,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
index 678df981..1c477cb2 100644
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 int main(int argc, char **argv) {
   using namespace Grid;
@@ -345,5 +349,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
index 83f20b92..44a153d2 100644
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 NAMESPACE_BEGIN(Grid);
 
@@ -95,7 +99,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -516,5 +520,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
index c69c70c7..d2dd3b9e 100644
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 NAMESPACE_BEGIN(Grid);
 
@@ -95,7 +99,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -567,5 +571,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_RHMC.cc b/HMC/Mobius2p1f_DD_RHMC.cc
index 39b4c1dd..a95b5b2e 100644
--- a/HMC/Mobius2p1f_DD_RHMC.cc
+++ b/HMC/Mobius2p1f_DD_RHMC.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 int main(int argc, char **argv) {
   using namespace Grid;
@@ -263,5 +267,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_RHMC_96I.cc b/HMC/Mobius2p1f_DD_RHMC_96I.cc
index c28296a3..d2d10d2d 100644
--- a/HMC/Mobius2p1f_DD_RHMC_96I.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 int main(int argc, char **argv) {
   using namespace Grid;
@@ -417,5 +421,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
index 36528c9f..5a1bdb15 100644
--- a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
+++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 NAMESPACE_BEGIN(Grid);
 
@@ -95,7 +99,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -452,5 +456,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_EOFA_96I_hmc.cc b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
index 91f0bd95..b4999fc3 100644
--- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 NAMESPACE_BEGIN(Grid);
 
@@ -95,7 +99,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
       //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
       // Assumption made in code to extract gauge field
       // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       ////////////////////////////////////////////////////////////////////////////////////
       // Must snarf a single precision copy of the gauge field in Linop_d argument
@@ -462,5 +466,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
index 61b32b5c..201866a0 100644
--- a/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
+++ b/HMC/Mobius2p1f_EOFA_96I_hmc_double.cc
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+#include<Grid/Grid.h>
 
 
 
@@ -264,5 +268,4 @@ int main(int argc, char **argv) {
   Grid_finalize();
 } // main
 
-
-
+#endif
diff --git a/HMC/disable_examples_without_instantiations.h b/HMC/disable_examples_without_instantiations.h
new file mode 100644
index 00000000..79dbe9c6
--- /dev/null
+++ b/HMC/disable_examples_without_instantiations.h
@@ -0,0 +1,14 @@
+#pragma once
+// Stub entry point for examples that cannot be built without the fermion instantiations.
+#ifndef BUILD_FERMION_INSTANTIATIONS // NOTE(review): includers test ENABLE_FERMION_INSTANTIATIONS -- confirm both macros are always set together
+#include <iostream>
+// Replacement main(): reports why the example is unavailable and exits with status 1.
+int main(void) {
+  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
+	    << "which this example relies on. "
+	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
+	    << "to run this example."
+	    << std::endl;
+  return 1;
+}
+#endif
diff --git a/MPI_benchmark/halo_mpi.cc b/MPI_benchmark/halo_mpi.cc
index 9e11c473..b9b08c14 100644
--- a/MPI_benchmark/halo_mpi.cc
+++ b/MPI_benchmark/halo_mpi.cc
@@ -47,7 +47,7 @@ void *acceleratorAllocDevice(size_t bytes)
 {
   void *ptr=NULL;
   auto err = cudaMalloc((void **)&ptr,bytes);
-  assert(err==cudaSuccess);
+  GRID_ASSERT(err==cudaSuccess);
   return ptr;
 }
 void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc
index 01325c53..8474c4b2 100644
--- a/benchmarks/Benchmark_IO.cc
+++ b/benchmarks/Benchmark_IO.cc
@@ -37,7 +37,7 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
   Eigen::MatrixXd sqSum(nr, nc);
   double          n = static_cast<double>(data.size());
 
-  assert(n > 1.);
+  GRID_ASSERT(n > 1.);
   mean  = Mat::Zero(nr, nc);
   sqSum = Mat::Zero(nr, nc);
   for (auto &d: data)
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index c42136b6..64255b99 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace Grid;
@@ -325,7 +328,7 @@ public:
     // Set/Get the layout & grid size
     ///////////////////////////////////////////////////////
     int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
     Coordinate local({L,L,L,L});
     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
 
@@ -492,7 +495,7 @@ public:
     // Set/Get the layout & grid size
     ///////////////////////////////////////////////////////
     int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
     Coordinate local({L,L,L,L});
     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
     
@@ -731,3 +734,5 @@ int main (int argc, char ** argv)
 
   Grid_finalize();
 }
+
+#endif
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 29772141..c28b2686 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -20,6 +20,9 @@
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -312,7 +315,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
       FGrid->Barrier();
       exit(-1);
     }
-    assert (n2e< 1.0e-4 );
+    GRID_ASSERT (n2e< 1.0e-4 );
   }
 
   if (1)
@@ -370,7 +373,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   n2e= norm2(err);
   std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
 
-  assert((n2e)<1.0e-4);
+  GRID_ASSERT((n2e)<1.0e-4);
   
   FermionField src_e (FrbGrid);
   FermionField src_o (FrbGrid);
@@ -429,13 +432,14 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   err = r_eo-result;
   n2e= norm2(err);
   std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
-  assert(n2e<1.0e-4);
+  GRID_ASSERT(n2e<1.0e-4);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
-  assert(norm2(src_e)<1.0e-4);
-  assert(norm2(src_o)<1.0e-4);
+  GRID_ASSERT(norm2(src_e)<1.0e-4);
+  GRID_ASSERT(norm2(src_o)<1.0e-4);
 }
+#endif
diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index 583def29..20a695f5 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -20,6 +20,9 @@
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -312,7 +315,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
       FGrid->Barrier();
       exit(-1);
     }
-    assert (n2e< 1.0e-4 );
+    GRID_ASSERT (n2e< 1.0e-4 );
   }
 
   if (1)
@@ -370,7 +373,7 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   n2e= norm2(err);
   std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
 
-  assert((n2e)<1.0e-4);
+  GRID_ASSERT((n2e)<1.0e-4);
   
   FermionField src_e (FrbGrid);
   FermionField src_o (FrbGrid);
@@ -429,13 +432,15 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
   err = r_eo-result;
   n2e= norm2(err);
   std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
-  assert(n2e<1.0e-4);
+  GRID_ASSERT(n2e<1.0e-4);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
-  assert(norm2(src_e)<1.0e-4);
-  assert(norm2(src_o)<1.0e-4);
+  GRID_ASSERT(norm2(src_e)<1.0e-4);
+  GRID_ASSERT(norm2(src_o)<1.0e-4);
 }
+
+#endif
diff --git a/benchmarks/Benchmark_dwf_fp32_paranoid.cc b/benchmarks/Benchmark_dwf_fp32_paranoid.cc
index 20f23b60..67317313 100644
--- a/benchmarks/Benchmark_dwf_fp32_paranoid.cc
+++ b/benchmarks/Benchmark_dwf_fp32_paranoid.cc
@@ -20,6 +20,9 @@
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -204,7 +207,7 @@ int main (int argc, char ** argv)
       Dw.Dhop(src,result,0);
       err = ref-result;
       std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-      assert (norm2(err)< 1.0e-4 );
+      GRID_ASSERT (norm2(err)< 1.0e-4 );
     }
     double t1=usecond();
     FGrid->Barrier();
@@ -244,7 +247,7 @@ int main (int argc, char ** argv)
       FGrid->Barrier();
       exit(-1);
     }
-    assert (norm2(err)< 1.0e-4 );
+    GRID_ASSERT (norm2(err)< 1.0e-4 );
   }
 
   if (1)
@@ -380,8 +383,10 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
-  assert(norm2(src_e)<1.0e-4);
-  assert(norm2(src_o)<1.0e-4);
+  GRID_ASSERT(norm2(src_e)<1.0e-4);
+  GRID_ASSERT(norm2(src_o)<1.0e-4);
   Grid_finalize();
   exit(0);
 }
+
+#endif
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index 2f11eb22..a3ccc9e3 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -238,5 +241,4 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   }
 }
 
-
-
+#endif
diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc
index 421dd3cd..1cfa2127 100644
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@@ -1,3 +1,7 @@
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+
 #include <Grid/Grid.h>
 #include <sstream>
 using namespace std;
@@ -155,3 +159,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 
+#endif
diff --git a/benchmarks/Benchmark_halo.cc b/benchmarks/Benchmark_halo.cc
index 43138e67..f95c29ad 100644
--- a/benchmarks/Benchmark_halo.cc
+++ b/benchmarks/Benchmark_halo.cc
@@ -20,6 +20,9 @@
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -129,3 +132,5 @@ int main (int argc, char ** argv)
   Grid_finalize();
   exit(0);
 }
+
+#endif
diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 3920d5f7..3ad02abe 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
 	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
 	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
-      assert(nn==nn);
+      GRID_ASSERT(nn==nn);
   }    
 
   Grid_finalize();
diff --git a/benchmarks/Benchmark_meson_field.cc b/benchmarks/Benchmark_meson_field.cc
index cca33716..baec94da 100644
--- a/benchmarks/Benchmark_meson_field.cc
+++ b/benchmarks/Benchmark_meson_field.cc
@@ -52,9 +52,9 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
   const int Nsimd = grid->Nsimd();
   int Nt     = grid->GlobalDimensions()[orthogdim];
 
-  assert(mat.size()==Lblock*Rblock);
+  GRID_ASSERT(mat.size()==Lblock*Rblock);
   for(int t=0;t<mat.size();t++){
-    assert(mat[t].size()==Nt);
+    GRID_ASSERT(mat[t].size()==Nt);
   }
 
   int fd=grid->_fdimensions[orthogdim];
@@ -181,9 +181,9 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
   int Nt     = grid->GlobalDimensions()[orthogdim];
   int Ngamma = gammas.size();
 
-  assert(mat.size()==Lblock*Rblock*Ngamma);
+  GRID_ASSERT(mat.size()==Lblock*Rblock*Ngamma);
   for(int t=0;t<mat.size();t++){
-    assert(mat[t].size()==Nt);
+    GRID_ASSERT(mat[t].size()==Nt);
   }
 
   int fd=grid->_fdimensions[orthogdim];
@@ -329,9 +329,9 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
   int Nt     = grid->GlobalDimensions()[orthogdim];
   int Ngamma = gammas.size();
 
-  assert(mat.size()==Lblock*Rblock*Ngamma);
+  GRID_ASSERT(mat.size()==Lblock*Rblock*Ngamma);
   for(int t=0;t<mat.size();t++){
-    assert(mat[t].size()==Nt);
+    GRID_ASSERT(mat[t].size()==Nt);
   }
 
   int fd=grid->_fdimensions[orthogdim];
@@ -472,9 +472,9 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
   int Ngamma = gammas.size();
   int Nmom   = mom.size();
 
-  assert(mat.size()==Lblock*Rblock*Ngamma*Nmom);
+  GRID_ASSERT(mat.size()==Lblock*Rblock*Ngamma*Nmom);
   for(int t=0;t<mat.size();t++){
-    assert(mat[t].size()==Nt);
+    GRID_ASSERT(mat[t].size()==Nt);
   }
 
   int fd=grid->_fdimensions[orthogdim];
diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc
index 54235752..d3d39126 100644
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -149,3 +152,5 @@ int main (int argc, char ** argv)
 
   Grid_finalize();
 }
+
+#endif
diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc
index 8171998a..644a158c 100644
--- a/benchmarks/Benchmark_schur.cc
+++ b/benchmarks/Benchmark_schur.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -172,5 +175,4 @@ void benchDw(std::vector<int> & latt4, int Ls)
   //  Dw.Report();
 }
 
-
-
+#endif
diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc
index a2be7f62..65e04f27 100644
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -110,3 +113,5 @@ int main (int argc, char ** argv)
 
   Grid_finalize();
 }
+
+#endif
diff --git a/benchmarks/Benchmark_staggeredF.cc b/benchmarks/Benchmark_staggeredF.cc
index f7beed2d..e0f4331f 100644
--- a/benchmarks/Benchmark_staggeredF.cc
+++ b/benchmarks/Benchmark_staggeredF.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -112,3 +115,5 @@ int main (int argc, char ** argv)
 
   Grid_finalize();
 }
+
+#endif
diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc
index 1ca0a6ca..ee790343 100644
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -26,6 +26,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
+
 #include <Grid/Grid.h>
 #include <Grid/algorithms/blas/BatchedBlas.h>
 
@@ -397,7 +401,7 @@ public:
     // Set/Get the layout & grid size
     ///////////////////////////////////////////////////////
     int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
     Coordinate local({L,L,L,L});
     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
 
@@ -568,7 +572,7 @@ public:
     // Set/Get the layout & grid size
     ///////////////////////////////////////////////////////
     int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
     Coordinate local({L,L,L,L});
     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
     
@@ -723,7 +727,7 @@ public:
     // Set/Get the layout & grid size
     ///////////////////////////////////////////////////////
     int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
     Coordinate local({L,L,L,L});
     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
     
@@ -978,3 +982,5 @@ int main (int argc, char ** argv)
   Grid_finalize();
   fclose(FP);
 }
+
+#endif
diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 4510388b..ac2b8ebd 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -253,8 +256,10 @@ int main (int argc, char ** argv)
 
   // guard
   double err1 = norm2(err);
-  assert(fabs(err0) < 1.0e-3);
-  assert(fabs(err1) < 1.0e-3);
+  GRID_ASSERT(fabs(err0) < 1.0e-3);
+  GRID_ASSERT(fabs(err1) < 1.0e-3);
 
   Grid_finalize();
 }
+
+#endif
diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index 45a10b25..6f840d56 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -19,6 +19,9 @@ Author: Richard Rollins <rprollins@users.noreply.github.com>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#include "disable_benchmarks_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -161,3 +164,5 @@ void bench_wilson_eo (
   double flops = (single_site_flops * volume * ncall)/2.0;
   std::cout << flops/(t1-t0) << "\t\t";
 }
+
+#endif
diff --git a/benchmarks/disable_benchmarks_without_instantiations.h b/benchmarks/disable_benchmarks_without_instantiations.h
new file mode 100644
index 00000000..c93fd0a9
--- /dev/null
+++ b/benchmarks/disable_benchmarks_without_instantiations.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifndef ENABLE_FERMION_INSTANTIATIONS // same macro the including .cc tests, so exactly one main() is compiled
+#include <iostream>
+
+int main(void) { // stub entry point used when the real benchmark body is compiled out
+  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
+	    << "which this benchmark relies on. "
+	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
+	    << "to run this benchmark."
+	    << std::endl;
+  return 1; // non-zero exit status: the benchmark did not run
+}
+#endif // !ENABLE_FERMION_INSTANTIATIONS
diff --git a/configure.ac b/configure.ac
index 84e8f46e..dce70db1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -172,6 +172,12 @@ case ${ac_TRACING} in
 esac
 
 ############### fermions
+AC_ARG_ENABLE([fermion-instantiations],
+     [AS_HELP_STRING([--enable-fermion-instantiations=yes|no],[enable fermion instantiations])],
+     [ac_FERMION_INSTANTIATIONS=${enable_fermion_instantiations}], [ac_FERMION_INSTANTIATIONS=yes])
+
+AM_CONDITIONAL(BUILD_FERMION_INSTANTIATIONS, [ test "${ac_FERMION_INSTANTIATIONS}X" = "yesX" ])
+
 AC_ARG_ENABLE([fermion-reps],
      [AS_HELP_STRING([--enable-fermion-reps=yes|no],[enable extra fermion representation support])],
      [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
@@ -292,13 +298,14 @@ AC_ARG_ENABLE([accelerator],
 case ${ac_ACCELERATOR} in
     cuda)
       echo CUDA acceleration
-      LIBS="${LIBS} -lcuda"
+      LIBS="${LIBS} -lcuda -lcublas -lcufft"
       AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);;
     sycl)
       echo SYCL acceleration
       AC_DEFINE([GRID_SYCL],[1],[Use SYCL offload]);;
     hip)
       echo HIP acceleration
+      LIBS="${LIBS} -lhipblas -lrocblas -lhipfft"
       AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);;
     none)
       echo NO acceleration    ;;
diff --git a/examples/Example_Laplacian_solver.cc b/examples/Example_Laplacian_solver.cc
index 9a584c0f..77313052 100644
--- a/examples/Example_Laplacian_solver.cc
+++ b/examples/Example_Laplacian_solver.cc
@@ -41,7 +41,7 @@ void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Fi
         return;
       }
     }
-    assert(0);
+    GRID_ASSERT(0);
 }
 
 
diff --git a/examples/Example_Mobius_spectrum.cc b/examples/Example_Mobius_spectrum.cc
index b604eec4..51609c69 100644
--- a/examples/Example_Mobius_spectrum.cc
+++ b/examples/Example_Mobius_spectrum.cc
@@ -3,6 +3,9 @@
  * without regression / tests being applied
  */
 
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -115,8 +118,8 @@ void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,Lat
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
+  GRID_ASSERT(mom.size()==Nd);
+  GRID_ASSERT(mom[Tdir] == 0);
 
   GridBase * grid = spectator.Grid();
 
@@ -310,5 +313,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 
-
-
+#endif
diff --git a/examples/Example_christoph.cc b/examples/Example_christoph.cc
index f8d83fa4..eb9a9041 100644
--- a/examples/Example_christoph.cc
+++ b/examples/Example_christoph.cc
@@ -3,6 +3,9 @@
  * without regression / tests being applied
  */
 
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -119,8 +122,8 @@ void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,Lat
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
+  GRID_ASSERT(mom.size()==Nd);
+  GRID_ASSERT(mom[Tdir] == 0);
 
   GridBase * grid = spectator.Grid();
 
@@ -432,5 +435,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 
-
-
+#endif
diff --git a/examples/Example_spec_kryschur.cc b/examples/Example_spec_kryschur.cc
index 7e70a180..e521ff3c 100644
--- a/examples/Example_spec_kryschur.cc
+++ b/examples/Example_spec_kryschur.cc
@@ -65,13 +65,12 @@
 using namespace std;
 using namespace Grid;
 
-<<<<<<< HEAD
 namespace Grid {
 
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
-		  		RealD, mass , 
-		  		RealD, mstep , 
+		  		RealD, mass ,
+		  		RealD, mstep ,
 				Integer, Nstop,
                                 Integer, Nk,
                                 Integer, Np,
@@ -90,7 +89,7 @@ struct LanczosParameters: Serializable {
     initialize(TheReader);
   }
 
-  template < class ReaderClass > 
+  template < class ReaderClass >
   void initialize(Reader<ReaderClass> &TheReader){
     read(TheReader, "HMC", *this);
   }
@@ -104,38 +103,36 @@ struct LanczosParameters: Serializable {
 //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
 //    MD.print_parameters();
   }
-  
+
 };
 
 }
 
-#if 0
-=======
-template <class T> void writeFile(T& in, std::string const fname){  
-  #ifdef HAVE_LIME
-    // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
-    std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
-    Grid::emptyUserRecord record;
-    Grid::ScidacWriter WR(in.Grid()->IsBoss());
-    WR.open(fname);
-    WR.writeScidacFieldRecord(in,record,0); // Lexico
-    WR.close();
-  #endif
+template <class T> void writeFile(T& in, std::string const fname){
+#ifdef HAVE_LIME
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record;
+  Grid::ScidacWriter WR(in.Grid()->IsBoss());
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
+  WR.close();
+#endif
 }
 
 /**
- * Writes the eigensystem of a Krylov Schur object to a directory. 
- * 
+ * Writes the eigensystem of a Krylov Schur object to a directory.
+ *
  * Parameters
  * ----------
  * std::string path
- *    Directory to write to. 
+ *    Directory to write to.
  */
 template <class Field>
 void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
   int Nk = KS.getNk();
   std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;
-  
+
   // Write evals
   std::string evalPath = outDir + "/evals.txt";
   std::ofstream fEval;
@@ -148,7 +145,7 @@ void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
     if (i < Nk - 1) { fEval << "\n"; }
   }
   fEval.close();
-  
+
   // Write evecs (TODO: very heavy on storage costs! Don't write them all out)
   // std::vector<Field> evecs = KS.getEvecs();
   // for (int i = 0; i < Nk; i++) {
@@ -156,8 +153,6 @@ void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
   //   writeFile(evecs[i], fName);     // using method from Grid/HMC/ComputeWilsonFlow.cc
   // }
 }
-
->>>>>>> 68af1bba67dd62881ead5ab1e54962a5486a0791
 // Hermitize a DWF operator by squaring it
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
diff --git a/examples/Example_wall_wall_3pt.cc b/examples/Example_wall_wall_3pt.cc
index ac8d44bb..c60ed02b 100644
--- a/examples/Example_wall_wall_3pt.cc
+++ b/examples/Example_wall_wall_3pt.cc
@@ -3,6 +3,9 @@
  * without regression / tests being applied
  */
 
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -157,8 +160,8 @@ void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,Lat
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
+  GRID_ASSERT(mom.size()==Nd);
+  GRID_ASSERT(mom[Tdir] == 0);
 
   GridBase * grid = spectator.Grid();
 
@@ -331,7 +334,7 @@ void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector
 int make_idx(int p, int m,int nmom)
 {
   if (m==0) return p;
-  assert(p==0);
+  GRID_ASSERT(p==0);
   return nmom + m - 1;
 }
 
@@ -535,5 +538,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 
-
-
+#endif
diff --git a/examples/Example_wall_wall_spectrum.cc b/examples/Example_wall_wall_spectrum.cc
index d4914213..b8b1a0d7 100644
--- a/examples/Example_wall_wall_spectrum.cc
+++ b/examples/Example_wall_wall_spectrum.cc
@@ -3,6 +3,9 @@
  * without regression / tests being applied
  */
 
+#include "disable_examples_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -157,8 +160,8 @@ void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,Lat
 }
 void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
 {
-  assert(mom.size()==Nd);
-  assert(mom[Tdir] == 0);
+  GRID_ASSERT(mom.size()==Nd);
+  GRID_ASSERT(mom[Tdir] == 0);
 
   GridBase * grid = spectator.Grid();
 
@@ -429,5 +432,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 
-
-
+#endif
diff --git a/examples/disable_examples_without_instantiations.h b/examples/disable_examples_without_instantiations.h
new file mode 100644
index 00000000..79dbe9c6
--- /dev/null
+++ b/examples/disable_examples_without_instantiations.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifndef ENABLE_FERMION_INSTANTIATIONS // same macro the including .cc tests, so exactly one main() is compiled
+#include <iostream>
+
+int main(void) { // stub entry point used when the real example body is compiled out
+  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
+	    << "which this example relies on. "
+	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
+	    << "to run this example."
+	    << std::endl;
+  return 1; // non-zero exit status: the example did not run
+}
+#endif // !ENABLE_FERMION_INSTANTIATIONS
diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index 500b2866..02d57c0d 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -2,7 +2,7 @@ export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
 export GMP=`spack find --paths gmp   | grep ^gmp  | awk '{print $2}' `
 export CLIME=`spack find --paths c-lime  | grep ^c-lime  | awk '{print $2}' `
 export UNWIND=`spack find --paths libunwind  | grep ^libunwind  | awk '{print $2}' `
-
+export HDF5=/opt/cray/pe/hdf5/1.12.2.3/gnu/9.1
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@@ -10,12 +10,13 @@ export UNWIND=`spack find --paths libunwind  | grep ^libunwind  | awk '{print $2
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
-	--enable-checksum-comms=yes \
+	--enable-checksum-comms=no \
 	--enable-log-views=yes \
 	--enable-accelerator=sycl \
 	--enable-accelerator-aware-mpi=no \
 	--enable-unified=no \
 	--with-lime=$CLIME \
+	--with-hdf5=${HDF5} \
 	--with-gmp=$GMP \
 	--with-mpfr=$MPFR \
 	--with-unwind=$UNWIND \
diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs
index 1d880f0d..d455c590 100644
--- a/systems/Aurora/tests/reproBigJob.pbs
+++ b/systems/Aurora/tests/reproBigJob.pbs
@@ -1,30 +1,21 @@
 #!/bin/bash
 
 #PBS -l select=32
-#PBS -q EarlyAppAccess
+#PBS -q prod
 #PBS -A LatticeQCD_aesp_CNDA
 #PBS -l walltime=02:00:00
 #PBS -N reproBigJob
 #PBS -k doe
+#PBS -l filesystems=flare
+#PBS -l filesystems=home
 
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 
-#module load oneapi/eng-compiler/2023.05.15.003
-#module load mpich/51.2/icc-all-deterministic-pmix-gpu
 
 # 56 cores / 6 threads ~9
 export OMP_NUM_THREADS=6
-export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
-export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
-#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
@@ -33,17 +24,12 @@ export GRID_PRINT_ENTIRE_LOG=0
 export GRID_CHECKSUM_RECV_BUF=0
 export GRID_CHECKSUM_SEND_BUF=0
 
-export MPICH_OFI_NIC_POLICY=GPU
-
-#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
-#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
-#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
-#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
-#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
-#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 
 cd $PBS_O_WORKDIR
 
+source ../source.sh
+
+
 cp $PBS_NODEFILE nodefile
 
 DIR=reproBigJob.$PBS_JOBID
@@ -58,17 +44,17 @@ BINARY=../Test_dwf_mixedcg_prec
 echo > pingjob <<EOF
 while read node ; 
 do
-	echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
+	echo ssh $node killall -HUP Test_dwf_mixedcg_prec
 done < nodefile
 EOF
 
 CMD="mpiexec -np 384 -ppn 12  -envall --hostfile nodefile \
-	     ../gpu_tile_compact.sh \
-	     $BINARY --mpi 4.4.4.6 --grid 128.128.128.96  \
-		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
+	     ../gpu_tile.sh \
+	     $BINARY --mpi 4.4.4.6 --grid 64.64.64.96  \
+		--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message "
 
 echo $CMD > command-line
 env > environment
 $CMD
-grep Oops Grid.stderr.* > failures.$PBS_JOBID
-rm core.*
+grep Oops */Grid.stderr.* > failures.$PBS_JOBID
+
diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command
index 8a3fdcfd..7561fb15 100644
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@@ -10,12 +10,11 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-fermion-reps \
 --enable-simd=GPU \
 --with-gmp=$OLCF_GMP_ROOT \
---with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
- LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
+ LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft"
 
 
 
diff --git a/tests/IO/Test_field_array_io.cc b/tests/IO/Test_field_array_io.cc
index 51ea7893..b186b42f 100644
--- a/tests/IO/Test_field_array_io.cc
+++ b/tests/IO/Test_field_array_io.cc
@@ -103,7 +103,7 @@ template<typename FieldType>
 void readFieldArray(std::vector<FieldType> &data, const std::string &file){
   typedef typename FieldType::vector_object vobj;
   typedef typename FieldType::scalar_object sobj;
-  assert(data.size() > 0);
+  GRID_ASSERT(data.size() > 0);
   GridBase* grid = data[0].Grid(); //assume all fields have the same Grid
   BinarySimpleUnmunger<sobj, sobj> munge; //straight copy
   
@@ -113,7 +113,7 @@ void readFieldArray(std::vector<FieldType> &data, const std::string &file){
   
   std::cout << "Data offset read " << offset << std::endl;  
   std::cout << "Data size read " << hdr_size << std::endl;
-  assert(data.size() == hdr_size);
+  GRID_ASSERT(data.size() == hdr_size);
 
   uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj);
 
@@ -132,7 +132,7 @@ void readFieldArray(std::vector<FieldType> &data, const std::string &file){
   std::cout << "Read checksum " << checksum << std::endl;
     
 
-  assert( hdr_checksum == checksum );
+  GRID_ASSERT( hdr_checksum == checksum );
 }
 
 
diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc
index 765509a9..e9797e77 100644
--- a/tests/IO/Test_openqcd_io.cc
+++ b/tests/IO/Test_openqcd_io.cc
@@ -61,7 +61,7 @@ int main(int argc, char** argv) {
   if(GridCmdOptionExists(argv, argv + argc, "--config")) {
     file = GridCmdOptionPayload(argv, argv + argc, "--config");
     std::cout << "file: " << file << std::endl;
-    assert(!file.empty());
+    GRID_ASSERT(!file.empty());
   }
 
   OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file);
diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc
index 243d1f72..f0e1b4de 100644
--- a/tests/Test_cayley_even_odd_vec.cc
+++ b/tests/Test_cayley_even_odd_vec.cc
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -273,8 +276,6 @@ void  TestWhat(What & Ddwf,
 
   err = phi-chi;
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
-
-  
 }
 
-
+#endif
diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc
index fc7775dd..bb180220 100644
--- a/tests/Test_compressed_lanczos_hot_start.cc
+++ b/tests/Test_compressed_lanczos_hot_start.cc
@@ -30,6 +30,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
  *  Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features 
  *  in Grid that were intended to be used to support blocked Aggregates, from
  */
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
@@ -56,7 +59,7 @@ public:
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
 #ifdef HAVE_LIME
-    assert(this->subspace.size()==nbasis);
+    GRID_ASSERT(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
     WR.open(evecs_file);
@@ -68,7 +71,7 @@ public:
     XmlWriter WRx(evals_file);
     write(WRx,"evals",this->evals_fine);
 #else
-    assert(0);
+    GRID_ASSERT(0);
 #endif
   }
 
@@ -82,7 +85,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_fine);
     
-    assert(this->evals_fine.size()==nbasis);
+    GRID_ASSERT(this->evals_fine.size()==nbasis);
     
     std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
     emptyUserRecord record;
@@ -95,7 +98,7 @@ public:
     }
     RD.close();
 #else
-    assert(0);
+    GRID_ASSERT(0);
 #endif 
   }
 
@@ -114,7 +117,7 @@ public:
     XmlWriter WRx(evals_file);
     write(WRx,"evals",this->evals_coarse);
 #else
-    assert(0);
+    GRID_ASSERT(0);
 #endif
   }
 
@@ -128,7 +131,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_coarse);
 
-    assert(this->evals_coarse.size()==nvec);
+    GRID_ASSERT(this->evals_coarse.size()==nvec);
     emptyUserRecord record;
     std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
     Grid::ScidacReader RD ;
@@ -138,7 +141,7 @@ public:
     }
     RD.close();
 #else 
-    assert(0);
+    GRID_ASSERT(0);
 #endif
   }
 };
@@ -179,19 +182,19 @@ int main (int argc, char ** argv) {
 
   std::vector<int> fineLatt     = latt;
   int dims=fineLatt.size();
-  assert(blockSize.size()==dims+1);
+  GRID_ASSERT(blockSize.size()==dims+1);
   std::vector<int> coarseLatt(dims);
   std::vector<int> coarseLatt5d ;
 
   for (int d=0;d<coarseLatt.size();d++){
-    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    GRID_ASSERT(coarseLatt[d]*blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<coarseLatt.size();i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  int cLs = Ls/blockSize[dims]; GRID_ASSERT(cLs*blockSize[dims]==Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -222,14 +225,14 @@ int main (int argc, char ** argv) {
 
   std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
   std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
-  assert(Nm2 >= Nm1);
+  GRID_ASSERT(Nm2 >= Nm1);
 
   const int nbasis= 60;
-  assert(nbasis==Ns1);
+  GRID_ASSERT(nbasis==Ns1);
   LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
 
-  assert( (Params.doFine)||(Params.doFineRead));
+  GRID_ASSERT( (Params.doFine)||(Params.doFineRead));
 
   if ( Params.doFine ) { 
     std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
@@ -256,3 +259,4 @@ int main (int argc, char ** argv) {
   Grid_finalize();
 }
 
+#endif
diff --git a/tests/Test_dwf_dslash_repro.cc b/tests/Test_dwf_dslash_repro.cc
index b0eac64b..57a18b00 100644
--- a/tests/Test_dwf_dslash_repro.cc
+++ b/tests/Test_dwf_dslash_repro.cc
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -237,3 +240,5 @@ int main (int argc, char ** argv)
   
   Grid_finalize();
 }
+
+#endif
diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc
index 97bf5143..e5771adc 100644
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -180,7 +183,7 @@ int main (int argc, char ** argv)
     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
     std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
 
-    assert(FlightRecorder::ErrorCount()==0);
+    GRID_ASSERT(FlightRecorder::ErrorCount()==0);
 
     std::cout << " FlightRecorder is OK! "<<std::endl;
     iter ++;
@@ -208,7 +211,7 @@ int main (int argc, char ** argv)
     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
     std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
-    assert(FlightRecorder::ErrorCount()==0);
+    GRID_ASSERT(FlightRecorder::ErrorCount()==0);
     std::cout << " FlightRecorder is OK! "<<std::endl;
     now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
     i++;
@@ -218,7 +221,9 @@ int main (int argc, char ** argv)
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 
   std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
-  assert(diff < 1e-4);
+  GRID_ASSERT(diff < 1e-4);
   
   Grid_finalize();
 }
+
+#endif
diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
index ff52b0d1..7a59f77d 100644
--- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc
+++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
+
 #include <Grid/Grid.h>
 
 using namespace std;
@@ -118,3 +121,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
 #endif
+#endif
diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc
index a8718c6b..dc4d6c0c 100644
--- a/tests/Test_innerproduct_norm.cc
+++ b/tests/Test_innerproduct_norm.cc
@@ -84,8 +84,8 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
     // clang-format on
 
-    assert(diff_ip_d == 0.);
-    assert(diff_norm2_d == 0.);
+    GRID_ASSERT(diff_ip_d == 0.);
+    GRID_ASSERT(diff_norm2_d == 0.);
 
     std::cout << GridLogMessage << "Double: all checks passed" << std::endl;
   }
@@ -116,8 +116,8 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
     // clang-format on
 
-    assert(diff_ip_f == 0.);
-    assert(diff_norm2_f == 0.);
+    GRID_ASSERT(diff_ip_f == 0.);
+    GRID_ASSERT(diff_norm2_f == 0.);
 
     std::cout << GridLogMessage << "Single: all checks passed" << std::endl;
   }
diff --git a/tests/Test_meson_field.cc b/tests/Test_meson_field.cc
index fa428d6a..5f85047f 100644
--- a/tests/Test_meson_field.cc
+++ b/tests/Test_meson_field.cc
@@ -24,6 +24,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
+#include "disable_tests_without_instantiations.h"
+#ifdef ENABLE_FERMION_INSTANTIATIONS
 
 #include <Grid/Grid.h>
 #include <Grid/qcd/utils/A2Autils.h>
@@ -157,3 +159,5 @@ int main(int argc, char *argv[])
   
   return EXIT_SUCCESS;
 }
+
+#endif
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 16205ee1..d8cd1dd6 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -175,7 +175,7 @@ void Tester(const functor &func)
   } else {
     std::cout<<GridLogMessage << " wrong!" <<std::endl;
   }
-  assert(ok==0);
+  GRID_ASSERT(ok==0);
 }
 
 template<class functor>
@@ -234,7 +234,7 @@ void IntTester(const functor &func)
   } else {
     std::cout<<GridLogMessage << " wrong!" <<std::endl;
   }
-  assert(ok==0);
+  GRID_ASSERT(ok==0);
 }
 
 
@@ -285,7 +285,7 @@ void ReductionTester(const functor &func)
   } else {
     std::cout<<GridLogMessage << " wrong!" <<std::endl;
   }
-  assert(ok==0);
+  GRID_ASSERT(ok==0);
 }
 
 
@@ -332,7 +332,7 @@ void IntReductionTester(const functor &func)
   } else {
     std::cout<<GridLogMessage << " wrong!" <<std::endl;
   }
-  assert(ok==0);
+  GRID_ASSERT(ok==0);
 }
 
 
@@ -456,7 +456,7 @@ void PermTester(const functor &func)
   } else {
     std::cout<<GridLogMessage << " wrong!" <<std::endl;
   }
-  assert(ok==0);
+  GRID_ASSERT(ok==0);
 }
 
 
@@ -520,8 +520,8 @@ void ExchangeTester(const functor &func)
 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
       }
     }
-    //    assert(found==1);
-    assert(found==1||found==0);
+    //    GRID_ASSERT(found==1);
+    GRID_ASSERT(found==1||found==0);
   }
   for(int i=0;i<Nsimd;i++){
     int found=0;
@@ -531,8 +531,8 @@ void ExchangeTester(const functor &func)
 	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
       }
     }
-    //    assert(found==1);
-    assert(found==1||found==0);
+    //    GRID_ASSERT(found==1);
+    GRID_ASSERT(found==1||found==0);
   }
 
   /*
@@ -547,8 +547,8 @@ void ExchangeTester(const functor &func)
   }
   */
   for(int i=0;i<Nsimd;i++){
-    assert(test1[i]==input1[i]);
-    assert(test2[i]==input2[i]);
+    GRID_ASSERT(test1[i]==input1[i]);
+    GRID_ASSERT(test2[i]==input2[i]);
   }
 }
 
@@ -789,7 +789,7 @@ int main (int argc, char ** argv)
       nrm = innerProduct(DD[i],DD[i]);
       auto tmp = Reduce(nrm);
       //      std::cout << tmp << std::endl;
-      assert( tmp < 1.0e-14 );
+      GRID_ASSERT( tmp < 1.0e-14 );
     }
     std::cout <<" OK ! "<<std::endl;
 
@@ -805,7 +805,7 @@ int main (int argc, char ** argv)
       nrm = innerProduct(DD[i],DD[i]);
       auto tmp = Reduce(nrm);
       //      std::cout << tmp << std::endl;
-      assert( tmp < 1.0e-3 );
+      GRID_ASSERT( tmp < 1.0e-3 );
     }
     std::cout <<" OK ! "<<std::endl;
 
@@ -820,7 +820,7 @@ int main (int argc, char ** argv)
       nrm = innerProduct(FF[i],FF[i]);
       auto tmp = Reduce(nrm);
       //      std::cout << tmp << std::endl;
-      assert( tmp < 1.0e-3 );
+      GRID_ASSERT( tmp < 1.0e-3 );
     }
     std::cout <<" OK ! "<<std::endl;
 #endif
diff --git a/tests/core/Test_checker.cc b/tests/core/Test_checker.cc
index c2382e91..6b7def0c 100644
--- a/tests/core/Test_checker.cc
+++ b/tests/core/Test_checker.cc
@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-  assert(argc >= 5);
+  GRID_ASSERT(argc >= 5);
   
   Coordinate latt(4,0);
   latt[0] = toint(argv[1]);
diff --git a/tests/core/Test_compact_wilson_clover_speedup.cc b/tests/core/Test_compact_wilson_clover_speedup.cc
index d09b4c1d..8aa15026 100644
--- a/tests/core/Test_compact_wilson_clover_speedup.cc
+++ b/tests/core/Test_compact_wilson_clover_speedup.cc
@@ -205,7 +205,7 @@ void runBenchmark(int* argc, char*** argv) {
   double secs_res = (t5-t4)/1e6; \
   grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \
                   "compact_"#KERNEL, precision.c_str(), secs_res, clov_gflop_total/secs_res, clov_gbyte_total/secs_res, secs_ref/secs_res, secs_res/secs_hop); \
-  assert(resultsAgree(ref, res, #KERNEL)); \
+  GRID_ASSERT(resultsAgree(ref, res, #KERNEL)); \
 }
 
   BENCH_CLOVER_KERNEL(Mooee);
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
index 16ee5a0f..b048e31a 100644
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid.h>
 
 using namespace Grid;
- ;
 
 int main (int argc, char ** argv)
 {
@@ -116,10 +115,10 @@ int main (int argc, char ** argv)
 
   Stilde=S;
   std::cout<<" Benchmarking FFT of LatticeSpinMatrix  "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" mflops "<<std::endl;
 
   SpinMatrixD Sp; 
   Sp = Zero(); Sp = Sp+cVol;
@@ -202,11 +201,16 @@ int main (int argc, char ** argv)
     FFT theFFT5(FGrid);
       
     theFFT5.FFT_dim(result5,tmp5,1,FFT::forward); tmp5 = result5;
+    std::cout<<"Fourier xformed Ddwf 1 "<<norm2(result5)<<std::endl;
     theFFT5.FFT_dim(result5,tmp5,2,FFT::forward); tmp5 = result5;
+    std::cout<<"Fourier xformed Ddwf 2 "<<norm2(result5)<<std::endl;
     theFFT5.FFT_dim(result5,tmp5,3,FFT::forward); tmp5 = result5;
-    theFFT5.FFT_dim(result5,tmp5,4,FFT::forward); result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
+    std::cout<<"Fourier xformed Ddwf 3 "<<norm2(result5)<<std::endl;
+    theFFT5.FFT_dim(result5,tmp5,4,FFT::forward); 
+    std::cout<<"Fourier xformed Ddwf 4 "<<norm2(result5)<<std::endl;
+    result5 = result5*ComplexD(::sqrt(1.0/vol),0.0);
     
-    std::cout<<"Fourier xformed Ddwf"<<std::endl;
+    std::cout<<"Fourier xformed Ddwf "<<norm2(result5)<<std::endl;
     
     tmp5 = src5;
     theFFT5.FFT_dim(src5_p,tmp5,1,FFT::forward); tmp5 = src5_p;
@@ -214,7 +218,7 @@ int main (int argc, char ** argv)
     theFFT5.FFT_dim(src5_p,tmp5,3,FFT::forward); tmp5 = src5_p;
     theFFT5.FFT_dim(src5_p,tmp5,4,FFT::forward); src5_p = src5_p*ComplexD(::sqrt(1.0/vol),0.0);
 
-    std::cout<<"Fourier xformed src5"<<std::endl;
+    std::cout<<"Fourier xformed src5"<< norm2(src5)<<" -> "<<norm2(src5_p)<<std::endl;
       
     /////////////////////////////////////////////////////////////////
     // work out the predicted from Fourier
@@ -251,7 +255,8 @@ int main (int argc, char ** argv)
       Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src5_p);
       
     }
-    
+    std::cout << " src5    "<<norm2(src5_p)<<std::endl;
+    std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;
     // NB implicit sum over mu
     //
     // 1-1/2 Dw = 1 - 1/2 ( eip+emip)
@@ -260,18 +265,23 @@ int main (int argc, char ** argv)
     //          = 2 sink/2 ink/2 = sk2
     
     W = one - M5 + sk2; 
+    std::cout << " W "<<norm2(W)<<std::endl;
     Kinetic = Kinetic + W * src5_p;
+
+    std::cout << " Kinetic "<<norm2(Kinetic)<<std::endl;
     
     LatticeCoordinate(scoor,sdir);
     
     tmp5 = Cshift(src5_p,sdir,+1);
     tmp5 = (tmp5 - G5*tmp5)*0.5;
     tmp5 = where(scoor==Integer(Ls-1),mass*tmp5,-tmp5);
+    std::cout << " tmp5 "<<norm2(tmp5)<<std::endl;
     Kinetic = Kinetic + tmp5;
     
     tmp5 = Cshift(src5_p,sdir,-1);
     tmp5 = (tmp5 + G5*tmp5)*0.5;
     tmp5 = where(scoor==Integer(0),mass*tmp5,-tmp5);
+    std::cout << " tmp5 "<<norm2(tmp5)<<std::endl;
     Kinetic = Kinetic + tmp5;
     
     std::cout<<"Momentum space Ddwf  "<< norm2(Kinetic)<<std::endl;
@@ -279,7 +289,7 @@ int main (int argc, char ** argv)
     
     result5 = result5 - Kinetic;
     std::cout<<"diff "<< norm2(result5)<<std::endl;
-    assert(norm2(result5)<1.0e-4);
+    GRID_ASSERT(norm2(result5)<1.0e-4);
     
   }
 
@@ -339,7 +349,7 @@ int main (int argc, char ** argv)
     Ddwf.Mdag(src5,tmp5);
     src5=tmp5;
     MdagMLinearOperator<DomainWallFermionD,LatticeFermionD> HermOp(Ddwf);
-    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
     CG(HermOp,src5,result5);
     
     ////////////////////////////////////////////////////////////////////////
@@ -358,7 +368,7 @@ int main (int argc, char ** argv)
     
     diff = ref - result4;
     std::cout << "result - ref     "<<norm2(diff)<<std::endl;
-    assert(norm2(diff)<1.0e-4);
+    GRID_ASSERT(norm2(diff)<1.0e-4);
 
   }
 
@@ -423,7 +433,7 @@ int main (int argc, char ** argv)
     Dov.Mdag(src5,tmp5);
     src5=tmp5;
     MdagMLinearOperator<OverlapWilsonCayleyTanhFermionD,LatticeFermionD> HermOp(Dov);
-    ConjugateGradient<LatticeFermionD> CG(1.0e-16,10000);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
     CG(HermOp,src5,result5);
     
     ////////////////////////////////////////////////////////////////////////
@@ -442,7 +452,7 @@ int main (int argc, char ** argv)
     
     diff = ref - result4;
     std::cout << "result - ref     "<<norm2(diff)<<std::endl;
-    assert(norm2(diff)<1.0e-4);
+    GRID_ASSERT(norm2(diff)<1.0e-4);
 
   }
 
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 6d617e25..6375f1d3 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -196,7 +196,7 @@ int main (int argc, char ** argv)
   for(int i=1;i<argc;i++){
     std::string sarg(argv[i]);
     if(sarg == "--gimpl"){
-      assert(i<argc-1 && "--gimpl option requires an argument");
+      GRID_ASSERT(i<argc-1 && "--gimpl option requires an argument");
       gimpl = argv[i+1];
       if(gimpl != "periodic" && gimpl != "conjugate")
 	assert(0 && "Invalid gimpl");
@@ -206,7 +206,7 @@ int main (int argc, char ** argv)
       std::cout << "Not doing the Fourier accelerated gauge fixing tests" << std::endl;
       do_fft_gfix = false;
     }else if(sarg == "--alpha"){
-      assert(i<argc-1 && "--alpha option requires an argument");
+      GRID_ASSERT(i<argc-1 && "--alpha option requires an argument");
       std::istringstream ss(argv[i+1]); ss >> alpha;
     }
   }
diff --git a/tests/core/Test_fft_prop.cc b/tests/core/Test_fft_prop.cc
new file mode 100644
index 00000000..39961242
--- /dev/null
+++ b/tests/core/Test_fft_prop.cc
@@ -0,0 +1,101 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft_prop.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+
+// Benchmark: forward-FFT a plane wave held in a LatticeObject, one dimension at
+// a time, printing MFlops and norm2 after each pass and the total time taken.
+template<class LatticeObject>
+void bench(GridCartesian *grid, std::string name)
+{
+  const std::string banner("*************************************************");
+  LatticeComplexD phase(grid);
+  LatticeComplexD x_mu(grid);
+  ComplexD   imag_unit(0.0,1.0);
+  Coordinate mom({1,2,3,4});
+  Coordinate dims = grid->_fdimensions;
+
+  std::cout<<banner<<std::endl;
+  std::cout<<" Benchmarking FFT of "<<name<<" on plane wave    "<<std::endl;
+  std::cout<<banner<<std::endl;
+  // phase = sum over mu of (2 pi mom[mu] / dims[mu]) * coordinate[mu]
+  phase=Zero();
+  for(int mu=0;mu<4;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ dims[mu];
+    LatticeCoordinate(x_mu,mu);
+    phase = phase + (TwoPiL * mom[mu]) * x_mu;
+  }
+  phase = exp(phase*imag_unit);
+  // Promote the plane wave into the lattice type being benchmarked
+  LatticeObject src(grid);
+  LatticeObject xform(grid);
+  src=Zero();
+  src = src+phase;
+
+  FFT theFFT(grid);
+  xform=src;
+  std::cout << " norm2(s) "<<norm2(xform)<<std::endl;
+  double elapsed= -usecond();
+  for(int dim=0;dim<4;dim++){ // source and destination are the same field
+    theFFT.FFT_dim(xform,xform,dim,FFT::forward);
+    std::cout << theFFT.MFlops()<<" mflops "<<norm2(xform)<<std::endl;
+  }
+  elapsed+= usecond();
+
+  std::cout<<banner<<std::endl;
+  std::cout<<" FFT of "<<dims <<" "<<name<<" took "<<elapsed/1.e6<<" s"<<std::endl;
+  std::cout<<banner<<std::endl;
+}
+
+
+// Driver for the FFT benchmark: initialises Grid, builds one GridCartesian
+// from the default lattice, SIMD and MPI layouts, and runs bench() for three
+// lattice field types before finalising Grid.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  // One grid is shared by every benchmark below
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+
+  // Complex scalar, colour-matrix and propagator fields, in that order
+  bench<LatticeComplexD>(&GRID,std::string("LatticeComplexD"));
+  bench<LatticeColourMatrixD>(&GRID,std::string("LatticeColourMatrixD"));
+  bench<LatticePropagatorD>(&GRID,std::string("LatticePropagatorD"));
+
+  Grid_finalize();
+}
diff --git a/tests/core/Test_lie_generators.cc b/tests/core/Test_lie_generators.cc
index 7792b743..2250747e 100644
--- a/tests/core/Test_lie_generators.cc
+++ b/tests/core/Test_lie_generators.cc
@@ -171,7 +171,7 @@ int main(int argc, char** argv) {
     Complex tr2 = TRa()()(b,c) * Complex(0,1);
     std::cout << " 2 Tr( Tc[Ta,Tb]) " << 2.0*tr1<<std::endl;
     std::cout << " - TRa_bc " << tr2<<std::endl;
-    assert(abs( (2.0*tr1-tr2) ) < 1.0e-7);
+    GRID_ASSERT(abs( (2.0*tr1-tr2) ) < 1.0e-7);
     std::cout << "------------------"<<std::endl;
   }}}
 #endif
diff --git a/tests/core/Test_memory_manager.cc b/tests/core/Test_memory_manager.cc
index 2d31751c..1baf8642 100644
--- a/tests/core/Test_memory_manager.cc
+++ b/tests/core/Test_memory_manager.cc
@@ -93,13 +93,13 @@ void  MemoryTest(GridCartesian         * FGrid, int N)
 	if ( dev ) { 
 	  autoView(A_v,A[v],AcceleratorRead);
 	  accelerator_for(ss,FGrid->oSites(),1,{
-	      //	      assert(B[v]==A_v[ss]()()().getlane(0));
+	      //	      GRID_ASSERT(B[v]==A_v[ss]()()().getlane(0));
 	    });
 	  //	std::cout << "["<<v<<"] checked on GPU"<<B[v]<<std::endl;
 	} else {
 	  autoView(A_v,A[v],CpuRead);
 	  thread_for(ss,FGrid->oSites(),{
-	      assert(B[v]==A_v[ss]()()().getlane(0));
+	      GRID_ASSERT(B[v]==A_v[ss]()()().getlane(0));
 	    });
 	  //	std::cout << "["<<v<<"] checked on CPU"<<B[v]<<std::endl;
 	}
diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc
index 8ee05c14..16f30f26 100644
--- a/tests/core/Test_sliceSum.cc
+++ b/tests/core/Test_sliceSum.cc
@@ -11,13 +11,13 @@ template<class vobj> inline void sliceSumCPU(const Grid::Lattice<vobj> &Data,std
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_object::scalar_type scalar_type;
   GridBase  *grid = Data.Grid();
-  assert(grid!=NULL);
+  GRID_ASSERT(grid!=NULL);
 
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
 
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
+  GRID_ASSERT(orthogdim >= 0);
+  GRID_ASSERT(orthogdim < Nd);
 
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
@@ -134,7 +134,7 @@ int main (int argc, char ** argv) {
       for(int t=0;t<reduction_reference.size();t++) {
 
         auto diff = reduction_reference[t]-reduction_result[t];
-        assert(abs(TensorRemove(diff)) < 1e-8 );
+        GRID_ASSERT(abs(TensorRemove(diff)) < 1e-8 );
 
       }
 
@@ -184,10 +184,10 @@ int main (int argc, char ** argv) {
       for(int t=0;t<reduction_reference_cv.size();t++) {
 
         auto diff = reduction_reference_cv[t]-reduction_result_cv[t];
-        assert(abs(diff()(0)()) < 1e-8 );
-        assert(abs(diff()(1)()) < 1e-8 );
-        assert(abs(diff()(2)()) < 1e-8 );
-        assert(abs(diff()(3)()) < 1e-8 );
+        GRID_ASSERT(abs(diff()(0)()) < 1e-8 );
+        GRID_ASSERT(abs(diff()(1)()) < 1e-8 );
+        GRID_ASSERT(abs(diff()(2)()) < 1e-8 );
+        GRID_ASSERT(abs(diff()(3)()) < 1e-8 );
 
       }
 
@@ -238,18 +238,18 @@ int main (int argc, char ** argv) {
 
         auto diff = reduction_reference_scv[t]-reduction_result_scv[t];
         // std::cout << diff <<std::endl;
-        assert(abs(diff()(0)(0)) < 1e-8 );
-        assert(abs(diff()(0)(1)) < 1e-8 );
-        assert(abs(diff()(0)(2)) < 1e-8 );
-        assert(abs(diff()(1)(0)) < 1e-8 );
-        assert(abs(diff()(1)(1)) < 1e-8 );
-        assert(abs(diff()(1)(2)) < 1e-8 );    
-        assert(abs(diff()(2)(0)) < 1e-8 );
-        assert(abs(diff()(2)(1)) < 1e-8 );
-        assert(abs(diff()(2)(2)) < 1e-8 );    
-        assert(abs(diff()(3)(0)) < 1e-8 );
-        assert(abs(diff()(3)(1)) < 1e-8 );
-        assert(abs(diff()(3)(2)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(0)(0)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(0)(1)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(0)(2)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(1)(0)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(1)(1)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(1)(2)) < 1e-8 );    
+        GRID_ASSERT(abs(diff()(2)(0)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(2)(1)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(2)(2)) < 1e-8 );    
+        GRID_ASSERT(abs(diff()(3)(0)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(3)(1)) < 1e-8 );
+        GRID_ASSERT(abs(diff()(3)(2)) < 1e-8 );
 
       }
 
@@ -304,7 +304,7 @@ int main (int argc, char ** argv) {
           for (int js = 0; js < Ns; js++) {
             for (int ic = 0; ic < Nc; ic++) {
               for (int jc = 0; jc < Nc; jc++) {
-                assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
+                GRID_ASSERT(abs(diff()(is,js)(ic,jc)) < 1e-8);
               }
             }
           }
diff --git a/tests/core/Test_uvm.cc b/tests/core/Test_uvm.cc
index 290aa975..695980d3 100644
--- a/tests/core/Test_uvm.cc
+++ b/tests/core/Test_uvm.cc
@@ -77,11 +77,11 @@ public:
 	  ComplexD ref = B[v][p*PageWords];
 	  std::cout << "Device compare "<<B[v][p*PageWords]<<std::endl;
 	  accelerator_for(ss,1,1,{
-	      assert(ref==A_v[p*PageWords]);
+	      GRID_ASSERT(ref==A_v[p*PageWords]);
 	    });
 	} else {
 	  std::cout << "Host compare "<<B[v][p*PageWords]<<std::endl;
-	  assert(B[v][p*PageWords]==A[v][p*PageWords]);
+	  GRID_ASSERT(B[v][p*PageWords]==A[v][p*PageWords]);
 	}
       }
     }
diff --git a/tests/core/Test_where_extended.cc b/tests/core/Test_where_extended.cc
index 9862b3ed..17a2c0a0 100644
--- a/tests/core/Test_where_extended.cc
+++ b/tests/core/Test_where_extended.cc
@@ -77,7 +77,7 @@ int main (int argc, char ** argv)
 	ns=ns+norm2(sl);
       }
       std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
-      assert(abs(nn-ns) < 1.0e-10);
+      GRID_ASSERT(abs(nn-ns) < 1.0e-10);
     }
   }
 
@@ -105,7 +105,7 @@ int main (int argc, char ** argv)
 	ns=ns+norm2(sl);
       }
       std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
-      assert(abs(nn-ns) < 1.0e-10);
+      GRID_ASSERT(abs(nn-ns) < 1.0e-10);
     }
   }
 
@@ -135,7 +135,7 @@ int main (int argc, char ** argv)
 	ns=ns+norm2(sl);
       }
       std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
-      assert(abs(nn-ns) < 1.0e-10);
+      GRID_ASSERT(abs(nn-ns) < 1.0e-10);
     }
   }
 
diff --git a/tests/core/Test_wilson_clover.cc b/tests/core/Test_wilson_clover.cc
index 0ce0513f..a737040e 100644
--- a/tests/core/Test_wilson_clover.cc
+++ b/tests/core/Test_wilson_clover.cc
@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 
   err = ref - r_eo;
   std::cout << GridLogMessage << "EO norm diff\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
 
 
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
 
   err = ref - r_eo;
   std::cout << GridLogMessage << "EO norm diff compact\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
@@ -204,7 +204,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.Mooee(chi_e, src_e);
   Dwc_compact.MooeeInv(src_e, phi_e);
@@ -217,7 +217,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
   std::cout << GridLogMessage << "= Test MeeDag MeeInvDag = 1    (if csw!=0)                    " << std::endl;
@@ -237,7 +237,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.MooeeDag(chi_e, src_e);
   Dwc_compact.MooeeInvDag(src_e, phi_e);
@@ -250,7 +250,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
   std::cout << GridLogMessage << "= Test MeeInv MeeDag = 1      (if csw!=0)                     " << std::endl;
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.MooeeDag(chi_e, src_e);
   Dwc_compact.MooeeInv(src_e, phi_e);
@@ -283,7 +283,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "================================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing gauge covariance Clover term with EO preconditioning  " << std::endl;
@@ -339,7 +339,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -368,7 +368,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "=================================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing gauge covariance Clover term w/o EO preconditioning  " << std::endl;
@@ -389,10 +389,10 @@ int main(int argc, char **argv)
 
   err = result - adj(Omega) * result2;
   std::cout << GridLogMessage << "norm diff Wilson              " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff WilsonClover        " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -402,7 +402,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff CompactWilsonClover " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==========================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing Mooee(csw=0) Clover to reproduce Mooee Wilson   " << std::endl;
@@ -432,7 +432,7 @@ int main(int argc, char **argv)
 
   err = chi - phi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -458,7 +458,7 @@ int main(int argc, char **argv)
 
   err = chi - phi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==========================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing EO operator is equal to the unprec              " << std::endl;
@@ -493,7 +493,7 @@ int main(int argc, char **argv)
   std::cout << GridLogMessage << "ref (unpreconditioned operator) diff         : " << norm2(ref) << std::endl;
   std::cout << GridLogMessage << "phi (EO decomposition)          diff         : " << norm2(phi) << std::endl;
   std::cout << GridLogMessage << "norm diff                                    : " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -524,7 +524,7 @@ int main(int argc, char **argv)
   std::cout << GridLogMessage << "ref (unpreconditioned operator) diff compact : " << norm2(ref) << std::endl;
   std::cout << GridLogMessage << "phi (EO decomposition)          diff compact : " << norm2(phi) << std::endl;
   std::cout << GridLogMessage << "norm diff compact                            : " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Grid_finalize();
 }
diff --git a/tests/core/Test_wilson_conserved_current.cc b/tests/core/Test_wilson_conserved_current.cc
index c66bf940..9d9097e4 100644
--- a/tests/core/Test_wilson_conserved_current.cc
+++ b/tests/core/Test_wilson_conserved_current.cc
@@ -198,8 +198,8 @@ void  TestConserved(Action & Dw,
   std::cout<<GridLogMessage<<"Vector Ward identity by timeslice (~ 0)"<<std::endl;
   for(int t=0;t<Nt;t++){
     std::cout<<GridLogMessage <<" t "<<t<<" SV "<<real(TensorRemove(sumSV[t]))<<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
-    assert(abs(real(TensorRemove(sumSV[t]))) < 1e-10);
-    assert(abs(real(TensorRemove(sumVV[t]))) < 1e-2);
+    GRID_ASSERT(abs(real(TensorRemove(sumSV[t]))) < 1e-10);
+    GRID_ASSERT(abs(real(TensorRemove(sumVV[t]))) < 1e-2);
   }
 
   ///////////////////////////////
@@ -245,9 +245,9 @@ void  TestConserved(Action & Dw,
 
     std::cout<<GridLogMessage << "Consistency check for sequential conserved " <<std::endl;
     std::cout<<GridLogMessage << "Diff S  = " << abs(check_S) << std::endl;
-    assert(abs(check_S) < 1e-8);
+    GRID_ASSERT(abs(check_S) < 1e-8);
     std::cout<<GridLogMessage << "Diff V  = " << abs(check_V) << std::endl;
-    assert(abs(check_V) < 1e-8);
+    GRID_ASSERT(abs(check_V) < 1e-8);
   }
 
 }
diff --git a/tests/core/Test_wilson_exp_clover.cc b/tests/core/Test_wilson_exp_clover.cc
index 017d8823..33f933d5 100644
--- a/tests/core/Test_wilson_exp_clover.cc
+++ b/tests/core/Test_wilson_exp_clover.cc
@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 
   err = ref - r_eo;
   std::cout << GridLogMessage << "EO norm diff\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
 
 
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
 
   err = ref - r_eo;
   std::cout << GridLogMessage << "EO norm diff compact\t" << norm2(err) << " (" << norm2(ref) << " - " << norm2(r_eo) << ")" << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
@@ -204,7 +204,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.Mooee(chi_e, src_e);
   Dwc_compact.MooeeInv(src_e, phi_e);
@@ -217,7 +217,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
   std::cout << GridLogMessage << "= Test MeeDag MeeInvDag = 1    (if csw!=0)                    " << std::endl;
@@ -237,7 +237,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.MooeeDag(chi_e, src_e);
   Dwc_compact.MooeeInvDag(src_e, phi_e);
@@ -250,7 +250,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==============================================================" << std::endl;
   std::cout << GridLogMessage << "= Test MeeInv MeeDag = 1      (if csw!=0)                     " << std::endl;
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Dwc_compact.MooeeDag(chi_e, src_e);
   Dwc_compact.MooeeInv(src_e, phi_e);
@@ -283,7 +283,7 @@ int main(int argc, char **argv)
 
   err = phi - chi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "================================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing gauge covariance Clover term with EO preconditioning  " << std::endl;
@@ -339,7 +339,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -368,7 +368,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "=================================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing gauge covariance Clover term w/o EO preconditioning  " << std::endl;
@@ -389,10 +389,10 @@ int main(int argc, char **argv)
 
   err = result - adj(Omega) * result2;
   std::cout << GridLogMessage << "norm diff Wilson                 " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff WilsonExpClover        " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -402,7 +402,7 @@ int main(int argc, char **argv)
 
   err = chi - adj(Omega) * phi;
   std::cout << GridLogMessage << "norm diff CompactWilsonExpClover " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==========================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing Mooee(csw=0) Clover to reproduce Mooee Wilson   " << std::endl;
@@ -432,7 +432,7 @@ int main(int argc, char **argv)
 
   err = chi - phi;
   std::cout << GridLogMessage << "norm diff " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -458,7 +458,7 @@ int main(int argc, char **argv)
 
   err = chi - phi;
   std::cout << GridLogMessage << "norm diff compact " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   std::cout << GridLogMessage << "==========================================================" << std::endl;
   std::cout << GridLogMessage << "= Testing EO operator is equal to the unprec              " << std::endl;
@@ -493,7 +493,7 @@ int main(int argc, char **argv)
   std::cout << GridLogMessage << "ref (unpreconditioned operator) diff         : " << norm2(ref) << std::endl;
   std::cout << GridLogMessage << "phi (EO decomposition)          diff         : " << norm2(phi) << std::endl;
   std::cout << GridLogMessage << "norm diff                                    : " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   chi = Zero();
   phi = Zero();
@@ -524,7 +524,7 @@ int main(int argc, char **argv)
   std::cout << GridLogMessage << "ref (unpreconditioned operator) diff compact : " << norm2(ref) << std::endl;
   std::cout << GridLogMessage << "phi (EO decomposition)          diff compact : " << norm2(phi) << std::endl;
   std::cout << GridLogMessage << "norm diff compact                            : " << norm2(err) << std::endl;
-  assert(fabs(norm2(err)) < tolerance);
+  GRID_ASSERT(fabs(norm2(err)) < tolerance);
 
   Grid_finalize();
 }
diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc
index 26d3dc60..7b40e57e 100644
--- a/tests/debug/Test_cayley_mres.cc
+++ b/tests/debug/Test_cayley_mres.cc
@@ -471,8 +471,8 @@ void  TestConserved1(Action & Ddwf, Action & Ddwfrev,
       // Mobius parameters
       auto b=Ddwf.bs[s];
       auto c=Ddwf.cs[s];
-      assert(Ddwfrev.bs[sr]==Ddwf.bs[s]);
-      assert(Ddwfrev.cs[sr]==Ddwf.cs[s]);
+      GRID_ASSERT(Ddwfrev.bs[sr]==Ddwf.bs[s]);
+      GRID_ASSERT(Ddwfrev.cs[sr]==Ddwf.cs[s]);
 
       LatticePropagator tmp(UGrid); 
 
diff --git a/tests/debug/Test_general_coarse.cc b/tests/debug/Test_general_coarse.cc
index 4351a901..c7bdbe09 100644
--- a/tests/debug/Test_general_coarse.cc
+++ b/tests/debug/Test_general_coarse.cc
@@ -48,16 +48,16 @@ class HermOpAdaptor : public LinearOperatorBase<Field>
   LinearOperatorBase<Field> & wrapped;
 public:
   HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
-  void OpDiag (const Field &in, Field &out) {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
   void Op     (const Field &in, Field &out){
     wrapped.HermOp(in,out);
   }
   void AdjOp     (const Field &in, Field &out){
     wrapped.HermOp(in,out);
   }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
   void HermOp(const Field &in, Field &out){
     wrapped.HermOp(in,out);
   }
@@ -286,7 +286,7 @@ int main (int argc, char ** argv)
       chi=chi-Aphi;
       RealD diff =norm2(chi);
       std::cout << r << " diff " << diff<<std::endl;
-      assert(diff < 1.0e-10);
+      GRID_ASSERT(diff < 1.0e-10);
     }
     std::cout << nrhs<< " mrhs " << t0/ncall/nrhs <<" us"<<std::endl;
     std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
diff --git a/tests/debug/Test_general_coarse_hdcg.cc b/tests/debug/Test_general_coarse_hdcg.cc
index 0d6b0a64..9c6be62e 100644
--- a/tests/debug/Test_general_coarse_hdcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg.cc
@@ -43,10 +43,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class CGSmoother : public LinearFunction<Field>
@@ -128,6 +128,10 @@ int main (int argc, char ** argv)
   typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
   HermFineMatrix FineHermOp(HermOpEO);
 
+  LatticeFermionD src(FrbGrid); 
+  src = ComplexD(1.0);
+  PowerMethod<LatticeFermionD>       PM;   PM(HermOpEO,src);
+
   ////////////////////////////////////////////////////////////
   ///////////// Coarse basis and Little Dirac Operator ///////
   ////////////////////////////////////////////////////////////
@@ -150,7 +154,7 @@ int main (int argc, char ** argv)
   std::cout << "**************************************"<<std::endl;
   std::cout << "Create Subspace"<<std::endl;
   std::cout << "**************************************"<<std::endl;
-  Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 
+  Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,35.,0.01,500);// <== last run
 
   std::cout << "**************************************"<<std::endl;
   std::cout << "Refine Subspace"<<std::endl;
@@ -185,7 +189,7 @@ int main (int argc, char ** argv)
   std::cout << "**************************************"<<std::endl;
 
   typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
-  Chebyshev<CoarseVector>      IRLCheby(0.05,40.0,101);  // 1 iter
+  Chebyshev<CoarseVector>      IRLCheby(0.01,16.0,201);  // 1 iter
   MrhsHermMatrix MrhsCoarseOp     (mrhs);
 
   CoarseVector pm_src(CoarseMrhs);
@@ -193,10 +197,10 @@ int main (int argc, char ** argv)
   PowerMethod<CoarseVector>       cPM;
   cPM(MrhsCoarseOp,pm_src);
 
-  int Nk=nrhs;
-  int Nm=Nk*3;
-  //  int Nk=36;
-  //  int Nm=144;
+  //  int Nk=16;
+  //  int Nm=Nk*3;
+  int Nk=32;
+  int Nm=128;
   int Nstop=Nk;
   int Nconv_test_interval=1;
   
@@ -210,7 +214,7 @@ int main (int argc, char ** argv)
 							  nrhs,
 							  Nk,
 							  Nm,
-							  1e-4,10);
+							  1e-4,100);
 
   int Nconv;
   std::vector<RealD>            eval(Nm);
@@ -231,8 +235,6 @@ int main (int argc, char ** argv)
   std::cout << "**************************************"<<std::endl;
   std::cout << " Recompute coarse evecs  "<<std::endl;
   std::cout << "**************************************"<<std::endl;
-  evec.resize(Nm,Coarse5d);
-  eval.resize(Nm);
   for(int r=0;r<nrhs;r++){
     random(CRNG,c_src[r]);
   }
@@ -243,7 +245,7 @@ int main (int argc, char ** argv)
   // Deflation guesser object
   ///////////////////////
   std::cout << "**************************************"<<std::endl;
-  std::cout << " Reimport coarse evecs  "<<std::endl;
+  std::cout << " Reimport coarse evecs "<<evec.size()<<" "<<eval.size()<<std::endl;
   std::cout << "**************************************"<<std::endl;
   MultiRHSDeflation<CoarseVector> MrhsGuesser;
   MrhsGuesser.ImportEigenBasis(evec,eval);
@@ -252,9 +254,11 @@ int main (int argc, char ** argv)
   // Extra HDCG parameters
   //////////////////////////
   int maxit=3000;
-  ConjugateGradient<CoarseVector>  CG(2.0e-1,maxit,false);
-  RealD lo=2.0;
-  int ord = 9;
+  //  ConjugateGradient<CoarseVector>  CG(2.0e-1,maxit,false);
+  //  ConjugateGradient<CoarseVector>  CG(1.0e-2,maxit,false);
+  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
+  RealD lo=0.2;
+  int ord = 7;
 
   DoNothingGuesser<CoarseVector> DoNothing;
   HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
@@ -300,6 +304,19 @@ int main (int argc, char ** argv)
     ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
     CGfine(HermOpEO, src, result);
   }
+  {
+    // Solve with plain conjugate gradient on the MdagM operator wrapping Ddwf, starting from a zero guess.
+    std::cout << "**************************************"<<std::endl;
+    std::cout << "Calling MdagM CG"<<std::endl;
+    std::cout << "**************************************"<<std::endl;
+    // Fields use the explicit double-precision type so they match the LatticeFermionD operator and solver below.
+    LatticeFermionD result(FGrid); result=Zero();
+    LatticeFermionD    src(FGrid); random(RNG5,src);
+
+    MdagMLinearOperator<MobiusFermionD, LatticeFermionD> HermOp(Ddwf);
+    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
+    CGfine(HermOp, src, result);
+  }
 #endif  
   Grid_finalize();
   return 0;
diff --git a/tests/debug/Test_general_coarse_hdcg_phys.cc b/tests/debug/Test_general_coarse_hdcg_phys.cc
index 3ec42fad..88118173 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys.cc
@@ -40,7 +40,7 @@ void SaveOperator(Coarsened &Operator,std::string file)
 #ifdef HAVE_LIME
   emptyUserRecord record;
   ScidacWriter WR(Operator.Grid()->IsBoss());
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   WR.open(file);
   for(int p=0;p<Operator._A.size();p++){
     auto tmp = Operator.Cell.Extract(Operator._A[p]);
@@ -57,7 +57,7 @@ void LoadOperator(Coarsened &Operator,std::string file)
   emptyUserRecord record;
   Grid::ScidacReader RD ;
   RD.open(file);
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   for(int p=0;p<Operator.geom.npoint;p++){
     conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
     //    RD.readScidacFieldRecord(Operator._A[p],record,BINARYIO_LEXICOGRAPHIC);
@@ -74,7 +74,7 @@ void ReLoadOperator(Coarsened &Operator,std::string file)
   emptyUserRecord record;
   Grid::ScidacReader RD ;
   RD.open(file);
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   for(int p=0;p<Operator.geom.npoint;p++){
     auto tmp=Operator.Cell.Extract(Operator._A[p]);
     RD.readScidacFieldRecord(tmp,record,0);
@@ -126,10 +126,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 /*
 template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys48.cc b/tests/debug/Test_general_coarse_hdcg_phys48.cc
index 20772eb5..5383ff60 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys48.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48.cc
@@ -44,7 +44,7 @@ void SaveOperator(Coarsened &Operator,std::string file)
 #ifdef HAVE_LIME
   emptyUserRecord record;
   ScidacWriter WR(Operator.Grid()->IsBoss());
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   WR.open(file);
   for(int p=0;p<Operator._A.size();p++){
     auto tmp = Operator.Cell.Extract(Operator._A[p]);
@@ -61,7 +61,7 @@ void LoadOperator(Coarsened &Operator,std::string file)
   emptyUserRecord record;
   Grid::ScidacReader RD ;
   RD.open(file);
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   for(int p=0;p<Operator.geom.npoint;p++){
     conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
     //    RD.readScidacFieldRecord(Operator._A[p],record,BINARYIO_LEXICOGRAPHIC);
@@ -78,7 +78,7 @@ void ReLoadOperator(Coarsened &Operator,std::string file)
   emptyUserRecord record;
   Grid::ScidacReader RD ;
   RD.open(file);
-  assert(Operator._A.size()==Operator.geom.npoint);
+  GRID_ASSERT(Operator._A.size()==Operator.geom.npoint);
   for(int p=0;p<Operator.geom.npoint;p++){
     auto tmp=Operator.Cell.Extract(Operator._A[p]);
     RD.readScidacFieldRecord(tmp,record,0);
@@ -147,7 +147,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -165,10 +165,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class CGSmoother : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc b/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
index 4a7890f4..722253ea 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_blockcg.cc
@@ -148,7 +148,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -166,10 +166,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos.cc b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos.cc
index 240c2d6b..6ac65c5f 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos.cc
@@ -148,7 +148,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -166,10 +166,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
index 6efe53bd..77f1b091 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_lanczos_subspace.cc
@@ -119,7 +119,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -137,10 +137,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class CGSmoother : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc b/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc
index 054b4c66..fb806bf2 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48_mixed.cc
@@ -92,7 +92,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -110,10 +110,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class CGSmoother : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc
index c45b2cb1..abfb0afe 100644
--- a/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys96_mixed.cc
@@ -92,7 +92,7 @@ void LoadEigenvectors(std::vector<RealD>            &eval,
 
     Grid::ScidacReader RD ;
     RD.open(evec_file);
-    assert(evec.size()==eval.size());
+    GRID_ASSERT(evec.size()==eval.size());
     for(int k=0;k<eval.size();k++) {
       RD.readScidacFieldRecord(evec[k],record);
     }
@@ -110,10 +110,10 @@ public:
   void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
   void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
   void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
-  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void OpDiag (const Field &in, Field &out)                  {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    GRID_ASSERT(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
 };
 
 template<class Field> class CGSmoother : public LinearFunction<Field>
diff --git a/tests/debug/Test_general_coarse_pvdagm.cc b/tests/debug/Test_general_coarse_pvdagm.cc
index 096574d0..d0ea894c 100644
--- a/tests/debug/Test_general_coarse_pvdagm.cc
+++ b/tests/debug/Test_general_coarse_pvdagm.cc
@@ -43,9 +43,9 @@ class PVdagMLinearOperator : public LinearOperatorBase<Field> {
 public:
   PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
 
-  void OpDiag (const Field &in, Field &out) {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
   void Op     (const Field &in, Field &out){
     //    std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
@@ -58,7 +58,7 @@ public:
     _PV.M(in,tmp);
     _Mat.Mdag(tmp,out);
   }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
   void HermOp(const Field &in, Field &out){
     //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
@@ -79,9 +79,9 @@ class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
 public:
   ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): shift(_shift),_Mat(Mat),_PV(PV){};
 
-  void OpDiag (const Field &in, Field &out) {    assert(0);  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
   void Op     (const Field &in, Field &out){
     //    std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
@@ -96,7 +96,7 @@ public:
     _Mat.Mdag(in,tmp);
     out = out + shift * in;
   }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  }
   void HermOp(const Field &in, Field &out){
     //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
@@ -368,7 +368,10 @@ int main (int argc, char ** argv)
   TrivialPrecon<CoarseVector> simple;
   NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
   //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
-  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10); 
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,12,12);  // 35 outer
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(5.0e-2, 100, LinOpCoarse,simple,12,12);  // 36 outer, 12s
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,12,12);  // 36 ; 11s   
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-1, 100, LinOpCoarse,simple,12,12);     
   L2PGCR.Level(3);
   c_res=Zero();
   L2PGCR(c_src,c_res);
@@ -400,7 +403,7 @@ int main (int argc, char ** argv)
 			    LinOpCoarse,
 			    L2PGCR);
   
-  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,100,PVdagM,TwoLevelPrecon,10,10);
   L1PGCR.Level(1);
 
   f_res=Zero();
diff --git a/tests/debug/Test_general_coarse_pvdagm_svd.cc b/tests/debug/Test_general_coarse_pvdagm_svd.cc
new file mode 100644
index 00000000..85589ebd
--- /dev/null
+++ b/tests/debug/Test_general_coarse_pvdagm_svd.cc
@@ -0,0 +1,493 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_pvdagm_svd.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Matrix,class Field>
+class PVdagMLinearOperator : public LinearOperatorBase<Field> { // composite operator: Op() applies _Mat.M then _PV.Mdag
+  Matrix &_Mat; // its M is applied first in Op(); its Mdag last in AdjOp()
+  Matrix &_PV;  // its Mdag is applied second in Op(); its M first in AdjOp()
+public:
+  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
+  // OpDiag/OpDir/OpDirAll are not implemented for this composite operator: any call trips GRID_ASSERT(0).
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
+  void Op     (const Field &in, Field &out){ // out = PV^dag M in
+    //    std::cout << GridLogMessage<< "Op: PVdag M "<<std::endl;
+    Field tmp(in.Grid()); // scratch field on the same grid as the input
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+  }
+  void AdjOp     (const Field &in, Field &out){ // out = M^dag PV in
+    //    std::cout << GridLogMessage<<"AdjOp: Mdag PV "<<std::endl;
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);
+    _Mat.Mdag(tmp,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ // not implemented; n1 and n2 are never written
+    GRID_ASSERT(0);
+  }
+  void HermOp(const Field &in, Field &out){ // out = AdjOp(Op(in)) = M^dag PV PV^dag M in
+    //    std::cout <<GridLogMessage<< "HermOp: Mdag PV PVdag M"<<std::endl;
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+    //    std::cout << "HermOp done "<<norm2(out)<<std::endl;
+  }
+};
+template<class Matrix,class Field>
+class MdagPVLinearOperator : public LinearOperatorBase<Field> { // composite operator: Op() applies _PV.M then _Mat.Mdag
+  Matrix &_Mat; // its Mdag is applied second in Op(); its M first in AdjOp()
+  Matrix &_PV;  // its M is applied first in Op(); its Mdag last in AdjOp()
+public:
+  MdagPVLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
+  // OpDiag/OpDir/OpDirAll are not implemented for this composite operator: any call trips GRID_ASSERT(0).
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
+  void Op     (const Field &in, Field &out){ // out = M^dag PV in
+    Field tmp(in.Grid()); // scratch field on the same grid as the input
+    //    std::cout <<GridLogMessage<< "Op: Mdag PV "<<std::endl;
+    _PV.M(in,tmp);
+    _Mat.Mdag(tmp,out);
+  }
+  void AdjOp     (const Field &in, Field &out){ // out = PV^dag M in
+    //    std::cout <<GridLogMessage<< "AdjOp: PVdag M "<<std::endl;
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ // not implemented; n1 and n2 are never written
+    GRID_ASSERT(0);
+  }
+  void HermOp(const Field &in, Field &out){ // out = AdjOp(Op(in)) = PV^dag M M^dag PV in
+    //    std::cout << GridLogMessage<<"HermOp: PVdag M Mdag PV "<<std::endl;
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+    //    std::cout << "HermOp done "<<norm2(out)<<std::endl;
+  }
+};
+template<class Matrix,class Field>
+class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> { // Op() applies PV^dag M and adds shift*in
+  Matrix &_Mat; // its M is applied first in Op(); its Mdag last in AdjOp()
+  Matrix &_PV;  // its Mdag is applied second in Op(); its M first in AdjOp()
+  RealD shift;  // real multiple of the input added to the result of Op() and AdjOp()
+public:
+  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){}; // initialisers listed in declaration order
+  // OpDiag/OpDir/OpDirAll are not implemented for this composite operator: any call trips GRID_ASSERT(0).
+  void OpDiag (const Field &in, Field &out) {    GRID_ASSERT(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    GRID_ASSERT(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    GRID_ASSERT(0);  };
+  void Op     (const Field &in, Field &out){ // out = PV^dag M in + shift*in
+    //    std::cout << "Op: PVdag M "<<std::endl;
+    Field tmp(in.Grid()); // scratch field on the same grid as the input
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+    out = out + shift * in;
+  }
+  void AdjOp     (const Field &in, Field &out){ // out = M^dag PV in + shift*in; adjoint of Op() because shift is real
+    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);      // feed the input through PV first; tmp must be written before it is read
+    _Mat.Mdag(tmp,out); // then M^dag, so the product actually lands in out
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    GRID_ASSERT(0);  } // not implemented; n1 and n2 are never written
+  void HermOp(const Field &in, Field &out){ // out = AdjOp(Op(in))
+    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+  }
+};
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > { // two-level preconditioner: pre-smooth, coarse correction, post-smooth
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();
+
+  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
+  typedef LinearOperatorBase<FineField>                            FineOperator;
+  typedef LinearFunction    <FineField>                            FineSmoother;
+  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
+  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
+  Aggregates     & _FineToCoarse;   // used for ProjectToSubspace and as the source of the coarse grid
+  Aggregates     & _CoarseToFine;   // used for PromoteFromSubspace
+  FineOperator   & _FineOperator;   // fine operator used to recompute the residual
+  FineSmoother   & _PreSmoother;    // applied to the input before the coarse correction
+  FineSmoother   & _PostSmoother;   // applied to the residual after the coarse correction
+  CoarseOperator & _CoarseOperator; // stored only; operator() does not call it
+  CoarseSolver   & _CoarseSolve;    // approximate solve on the coarse grid
+
+  int    level;  void Level(int lv) {level = lv; }; // level is stored only; operator() does not read it
+
+  MGPreconditionerSVD(Aggregates &FtoC,
+		      Aggregates &CtoF,
+		      FineOperator &Fine,
+		      FineSmoother &PreSmoother,
+		      FineSmoother &PostSmoother,
+		      CoarseOperator &CoarseOperator_,
+		      CoarseSolver &CoarseSolve_)
+    : _FineToCoarse(FtoC),
+      _CoarseToFine(CtoF),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+  // Apply the preconditioner to `in`, overwriting `out`; each stage is timed and logged.
+  virtual void operator()(const FineField &in, FineField & out) 
+  {
+    GridBase *CoarseGrid = _FineToCoarse.CoarseGrid;
+    //    auto CoarseGrid = _CoarseOperator.Grid();
+    CoarseVector Csrc(CoarseGrid);
+    CoarseVector Csol(CoarseGrid);
+    FineField vec1(in.Grid());
+    FineField vec2(in.Grid());
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
+    double t; // stopwatch: set to -usecond() before a stage, += usecond() after it
+    // Fine Smoother
+    //    out = in;
+    out = Zero();
+    t=-usecond();
+    _PreSmoother(in,out);
+    t+=usecond();
+
+    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl; // assumes usecond() returns microseconds -- TODO confirm
+
+    // Update the residual
+    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   // presumably vec1 = in - Op(out); confirm sub() argument order
+    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
+
+    // Fine to Coarse 
+    t=-usecond();
+    _FineToCoarse.ProjectToSubspace  (Csrc,vec1);
+    t+=usecond();
+    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse correction
+    t=-usecond();
+    Csol = Zero(); // zero starting guess for the coarse solve
+    _CoarseSolve(Csrc,Csol);
+    //Csol=Zero();
+    t+=usecond();
+    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse to Fine
+    t=-usecond();  
+    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
+    _CoarseToFine.PromoteFromSubspace(Csol,vec1); 
+    add(out,out,vec1); // accumulate the promoted coarse correction into out
+    t+=usecond();
+    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Residual
+    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  // residual recomputed after the coarse correction
+    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
+
+    // Fine Smoother
+    t=-usecond();
+    //    vec2=vec1;
+    vec2=Zero();
+    _PostSmoother(vec1,vec2); // smooth on the residual, starting from zero
+    t+=usecond();
+    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    add( out,out,vec2); // accumulate the post-smoother correction into out
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+int main (int argc, char ** argv) // driver: build U/V subspaces, coarsen PVdagM, check the coarse operator, then run nested GCR solves
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16; // fifth-dimension extent passed to makeFiveDimGrid / makeFiveDimRedBlackGrid below
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2; // every dimension of the default lattice is halved
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4); // NOTE(review): RNG4 is seeded but not used again in this function
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); // NOTE(review): CRNG is seeded but not used again in this function
+
+  LatticeFermion    src(FGrid); random(RNG5,src); // later reused as scratch output for Op() and PromoteFromSubspace
+  LatticeFermion result(FGrid); result=Zero(); // NOTE(review): not used again in this function
+  LatticeFermion    ref(FGrid); ref=Zero(); // NOTE(review): not used again in this function
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000"); // relative path: the configuration is looked up from the working directory
+  NerscIO::readConfiguration(Umu,header,file);
+  
+  RealD mass=0.01;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); // same construction with 1.0 in the mass slot; used as the PV operator below
+
+  const int nbasis = 30; // number of vectors in each of the U and V subspaces (CombinedUV holds 2*nbasis)
+  const int cb = 0 ;
+
+
+  NextToNearestStencilGeometry5D geom(Coarse5d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
+  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
+  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
+  PVdagM_t PVdagM(Ddwf,Dpv);
+  MdagPV_t MdagPV(Ddwf,Dpv);
+  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
+  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
+  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // shift 0.01; this instance is the operator given to SmootherGCR below
+
+
+  // Run power method on HOA??
+  PowerMethod<LatticeFermion>       PM; // NOTE(review): both calls below are commented out, so PM is unused
+  //  PM(PVdagM,src);
+  //  PM(MdagPV,src);
+ 
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace V(Coarse5d,FGrid,cb);
+  Subspace U(Coarse5d,FGrid,cb);
+
+  // Breeds right singular vectors with call to HermOp (V)
+  V.CreateSubspaceChebyshev(RNG5,PVdagM,
+			    nbasis,
+			    4000.0,0.003,
+			    500);
+
+  // Breeds left singular vectors with call to HermOp (U)
+  //  U.CreateSubspaceChebyshev(RNG5,PVdagM,
+  U.CreateSubspaceChebyshev(RNG5,MdagPV,
+			    nbasis,
+			    4000.0,0.003,
+			    500);
+  
+  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
+  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
+  for(int b=0;b<nbasis;b++){ // first nbasis slots hold V, the next nbasis slots hold U
+    CombinedUV.subspace[b]        = V.subspace[b];
+    CombinedUV.subspace[b+nbasis] = U.subspace[b];
+  }
+
+  int bl, br; // bl indexes the bra (left) vector, br the ket (right) vector in the four tables below
+  std::cout <<" <V| PVdagM| V> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(V.subspace[br],src); // NOTE(review): depends only on br, yet is recomputed for every bl (same in the three tables below)
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <V| PVdagM| U> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(U.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <U| PVdagM| V> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(V.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <U| PVdagM| U> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(U.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
+  }}
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
+  typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
+  
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  V.Orthogonalise(); // NOTE(review): CombinedUV copied V.subspace above, before this call
+  for(int b =0 ; b<nbasis;b++){ // project each U vector into V and back, printing the ratio of norms
+    CoarseVectorV c_src (Coarse5d);
+    V.ProjectToSubspace  (c_src,U.subspace[b]);
+    V.PromoteFromSubspace(c_src,src);
+    std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
+  }
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_proj(Coarse5d);
+  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
+  LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+
+  blockPromote(c_src,err,CombinedUV.subspace); // NOTE(review): the result lands in err, which is not read again in this function
+
+  LatticeFermion prom(FGrid);
+  prom=Zero();
+  for(int b=0;b<nbasis*2;b++){ // prom = plain sum of all 2*nbasis CombinedUV vectors
+    prom=prom+CombinedUV.subspace[b];
+  }
+
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  PVdagM.Op(prom,tmp);
+  blockProject(c_proj,tmp,CombinedUV.subspace); // fine operator applied to prom, then projected: reference for the coarse operator
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOpPV.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+
+  c_proj = c_proj - c_res; // difference between projected fine result and coarse-operator result
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res); // standalone coarse solve with the all-ones source
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res); // standalone smoother run with the all-ones source
+
+  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
+
+  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV, // same combined subspace for both restriction and prolongation
+			    PVdagM,
+			    simple_fine, // pre-smoother slot: the trivial preconditioner
+			    SmootherGCR, // post-smoother slot: the shifted GCR smoother
+			    LinOpCoarse,
+			    L2PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res); // outer solve on PVdagM, preconditioned by the two-level cycle
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_pvdagm_svd_cg.cc b/tests/debug/Test_general_coarse_pvdagm_svd_cg.cc
new file mode 100644
index 00000000..06f7632e
--- /dev/null
+++ b/tests/debug/Test_general_coarse_pvdagm_svd_cg.cc
@@ -0,0 +1,492 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_pvdagm_svd_cg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Matrix,class Field>
+class PVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+public:
+  PVdagMLinearOperator(Matrix &Mat,Matrix &PV) : _Mat(Mat), _PV(PV) {}
+  // Entry points this wrapper does not provide: each one is assert(0).
+  void OpDiag  (const Field &in, Field &out)                  { assert(0); }
+  void OpDir   (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
+  // out = PV^dag M in
+  void Op(const Field &in, Field &out){
+    Field Min(in.Grid());
+    _Mat.M(in,Min);
+    _PV.Mdag(Min,out);
+  }
+  // out = M^dag PV in
+  void AdjOp(const Field &in, Field &out){
+    Field PVin(in.Grid());
+    _PV.M(in,PVin);
+    _Mat.Mdag(PVin,out);
+  }
+  // out = HermOp(in) ; n1 = real part of <in|out> ; n2 = norm2(out)
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    const ComplexD overlap = innerProduct(in,out);
+    n1 = real(overlap);
+    n2 = norm2(out);
+  }
+  // out = M^dag PV PV^dag M in  (Op followed by AdjOp)
+  void HermOp(const Field &in, Field &out){
+    Field PVdagMin(in.Grid());
+    Op(in,PVdagMin);
+    AdjOp(PVdagMin,out);
+  }
+};
+template<class Matrix,class Field>
+class MdagPVLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+public:
+  MdagPVLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
+  // Not provided by this wrapper: each of these is assert(0).
+  void OpDiag (const Field &in, Field &out) {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  // out = M^dag PV in
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);
+    _Mat.Mdag(tmp,out);
+  }
+  // out = PV^dag M in
+  void AdjOp     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out); // fill out first: n1 = Re<in|out> and n2 = norm2(out) must describe HermOp(in)
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  // out = PV^dag M M^dag PV in  (Op followed by AdjOp)
+  void HermOp(const Field &in, Field &out){
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+  }
+};
+template<class Matrix,class Field>
+class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+  RealD shift;   // real multiple of the input added by Op and AdjOp
+public:
+  // Initialisers are listed in member declaration order (_Mat, _PV, shift).
+  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
+  void OpDiag (const Field &in, Field &out) {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  // out = (PV^dag M + shift) in
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+    out = out + shift * in;
+  }
+  // out = (M^dag PV + shift) in : the adjoint of Op, shift being real
+  void AdjOp     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);      // PV acts on the input first ...
+    _Mat.Mdag(tmp,out); // ... then M^dag maps the intermediate into out
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  // out = AdjOp(Op(in))
+  void HermOp(const Field &in, Field &out){
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+  }
+};
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();
+  // Field and operator vocabulary derived from the aggregation type
+  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
+  using FineField      = typename Aggregates::FineField;
+  using CoarseVector   = typename Aggregates::CoarseVector;
+  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
+  using FineOperator   = LinearOperatorBase<FineField>;
+  using FineSmoother   = LinearFunction    <FineField>;
+  using CoarseOperator = LinearOperatorBase<CoarseVector>;
+  using CoarseSolver   = LinearFunction    <CoarseVector>;
+  Aggregates     & _FineToCoarse;    // ProjectToSubspace is called on this
+  Aggregates     & _CoarseToFine;    // PromoteFromSubspace is called on this
+  FineOperator   & _FineOperator;
+  FineSmoother   & _PreSmoother;
+  FineSmoother   & _PostSmoother;
+  CoarseOperator & _CoarseOperator;  // held but not referenced by operator()
+  CoarseSolver   & _CoarseSolve;
+  int              level;
+  void Level(int lv) { level = lv; }
+
+  MGPreconditionerSVD(Aggregates     &FtoC,
+                      Aggregates     &CtoF,
+                      FineOperator   &Fine,
+                      FineSmoother   &PreSmoother,
+                      FineSmoother   &PostSmoother,
+                      CoarseOperator &CoarseOperator_,
+                      CoarseSolver   &CoarseSolve_)
+    : _FineToCoarse(FtoC),
+      _CoarseToFine(CtoF),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  virtual void operator()(const FineField &in, FineField & out)
+  {
+    // Sequence: pre-smooth, restrict residual, coarse solve, prolong, post-smooth.
+    // Each timed stage logs the usecond() difference divided by 1000 as "ms".
+    GridBase *cgrid = _FineToCoarse.CoarseGrid;
+    CoarseVector c_rhs(cgrid);
+    CoarseVector c_sol(cgrid);
+    FineField    resid(in.Grid());
+    FineField    post (in.Grid());
+    double t0, dt;
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    // ---- Pre-smoother
+    //      out is zeroed first; only the smoother call is timed
+    out = Zero();
+    t0 = usecond();
+    _PreSmoother(in,out);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "PreSmoother took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // Residual update for the current out (sub(a,b,c) presumably sets a = b - c)
+    _FineOperator.Op(out,resid);
+    sub(resid,in,resid);
+
+    // ---- Restriction
+    //      project the fine residual onto the coarse vector c_rhs
+    t0 = usecond();
+    _FineToCoarse.ProjectToSubspace(c_rhs,resid);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Project to coarse took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // ---- Coarse correction
+    //      zeroing c_sol is inside the timed region
+    t0 = usecond();
+    c_sol = Zero();
+    _CoarseSolve(c_rhs,c_sol);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Coarse solve took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // ---- Prolongation
+    //      resid is reused to hold the promoted coarse solution, then added to out
+    t0 = usecond();
+    _CoarseToFine.PromoteFromSubspace(c_sol,resid);
+    add(out,out,resid);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Promote to this level took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // Residual update again, with out now including the coarse correction
+    _FineOperator.Op(out,resid);
+    sub(resid,in,resid);
+
+    // ---- Post-smoother
+    //      applied to the residual from a zero guess; its result is added to out
+    t0 = usecond();
+    post = Zero();
+    _PostSmoother(resid,post);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "PostSmoother took "<< dt/1000.0<< "ms" <<std::endl;
+
+    add(out,out,post);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16; // fifth-dimension extent handed to makeFiveDimGrid / makeFiveDimRedBlackGrid
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid: every dimension of the default lattice is halved
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); // same call as FGrid, with 1 in place of Ls
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000"); // relative path: the configuration must be in the working directory
+  NerscIO::readConfiguration(Umu,header,file);
+  
+  RealD mass=0.01;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); // identical arguments except mass = 1.0; passed as the PV operator below
+
+  const int nbasis = 20; // vectors in each of V and U; CombinedUV holds 2*nbasis
+  const int cb = 0 ;
+
+
+  NextToNearestStencilGeometry5D geom(Coarse5d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
+  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
+  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
+  PVdagM_t PVdagM(Ddwf,Dpv);
+  MdagPV_t MdagPV(Ddwf,Dpv);
+  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
+  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
+  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // shift in use; this operator is handed to SmootherGCR below
+  // NOTE(review): the numbers trailing the commented-out shifts above look like recorded iteration counts -- confirm
+
+  // Run power method on HOA??  (both calls below are commented out, so PM is currently unused)
+  PowerMethod<LatticeFermion>       PM;
+  //  PM(PVdagM,src);
+  //  PM(MdagPV,src);
+ 
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace V(Coarse5d,FGrid,cb);
+  Subspace U(Coarse5d,FGrid,cb);
+
+  // Breeds right singular vectors with call to HermOp (V)
+  V.CreateSubspace(RNG5,PVdagM,nbasis);
+
+  // Breeds left singular vectors with call to HermOp (U)
+  //  U.CreateSubspaceChebyshev(RNG5,MdagPV,
+  U.CreateSubspace(RNG5,PVdagM,nbasis); // NOTE(review): built from PVdagM like V, not MdagPV as in the commented-out call -- confirm intended
+  
+  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
+  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
+  for(int b=0;b<nbasis;b++){
+    CombinedUV.subspace[b]        = V.subspace[b]; // slots [0,nbasis): V
+    CombinedUV.subspace[b+nbasis] = U.subspace[b]; // slots [nbasis,2*nbasis): U
+  }
+
+  int bl, br;
+  std::cout <<" <V| PVdagM| V> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(V.subspace[br],src); // overwrites src (the random source above); recomputed for every bl
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <V| PVdagM| U> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(U.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <U| PVdagM| V> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(V.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
+  }}
+  std::cout <<" <U| PVdagM| U> " <<std::endl;
+  for(bl=0;bl<nbasis;bl++){
+  for(br=0;br<nbasis;br++){
+    PVdagM.Op(U.subspace[br],src);
+    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
+  }}
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
+  typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
+  
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  V.Orthogonalise(); // called after CombinedUV was filled from V above
+  for(int b =0 ; b<nbasis;b++){
+    CoarseVectorV c_src (Coarse5d);
+    V.ProjectToSubspace  (c_src,U.subspace[b]);
+    V.PromoteFromSubspace(c_src,src);
+    std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
+  }
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_proj(Coarse5d);
+  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
+  LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+
+  blockPromote(c_src,err,CombinedUV.subspace); // err is not read again in this function
+
+  LatticeFermion prom(FGrid);
+  prom=Zero();
+  for(int b=0;b<nbasis*2;b++){ // prom = sum of all 2*nbasis CombinedUV vectors
+    prom=prom+CombinedUV.subspace[b];
+  }
+
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  PVdagM.Op(prom,tmp);
+  blockProject(c_proj,tmp,CombinedUV.subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOpPV.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+
+  c_proj = c_proj - c_res; // difference between projected fine result and coarse-operator result
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res);
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res);
+
+  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
+
+  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
+			    PVdagM,
+			    simple_fine, // pre-smoother slot of the constructor
+			    SmootherGCR, // post-smoother slot of the constructor
+			    LinOpCoarse,
+			    L2PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_pvdagm_svd_uv.cc b/tests/debug/Test_general_coarse_pvdagm_svd_uv.cc
new file mode 100644
index 00000000..299178fe
--- /dev/null
+++ b/tests/debug/Test_general_coarse_pvdagm_svd_uv.cc
@@ -0,0 +1,479 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_pvdagm_svd_uv.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Matrix,class Field>
+class PVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+public:
+  PVdagMLinearOperator(Matrix &Mat,Matrix &PV) : _Mat(Mat), _PV(PV) {}
+  // Entry points this wrapper does not provide: each one is assert(0).
+  void OpDiag       (const Field &in, Field &out)                  { assert(0); }
+  void OpDir        (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll     (const Field &in, std::vector<Field> &out)     { assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }
+
+  // out = PV^dag M in
+  void Op(const Field &in, Field &out){
+    Field Min(in.Grid());
+    _Mat.M(in,Min);
+    _PV.Mdag(Min,out);
+  }
+
+  // out = M^dag PV in
+  void AdjOp(const Field &in, Field &out){
+    Field PVin(in.Grid());
+    _PV.M(in,PVin);
+    _Mat.Mdag(PVin,out);
+  }
+
+  // out = M^dag PV PV^dag M in  (Op followed by AdjOp)
+  void HermOp(const Field &in, Field &out){
+    Field PVdagMin(in.Grid());
+    Op(in,PVdagMin);
+    AdjOp(PVdagMin,out);
+  }
+};
+template<class Matrix,class Field>
+class MdagPVLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+public:
+  MdagPVLinearOperator(Matrix &Mat,Matrix &PV) : _Mat(Mat), _PV(PV) {}
+  // Entry points this wrapper does not provide: each one is assert(0).
+  void OpDiag       (const Field &in, Field &out)                  { assert(0); }
+  void OpDir        (const Field &in, Field &out,int dir,int disp) { assert(0); }
+  void OpDirAll     (const Field &in, std::vector<Field> &out)     { assert(0); }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }
+
+  // out = M^dag PV in
+  void Op(const Field &in, Field &out){
+    Field PVin(in.Grid());
+    _PV.M(in,PVin);
+    _Mat.Mdag(PVin,out);
+  }
+
+  // out = PV^dag M in
+  void AdjOp(const Field &in, Field &out){
+    Field Min(in.Grid());
+    _Mat.M(in,Min);
+    _PV.Mdag(Min,out);
+  }
+
+  // out = PV^dag M M^dag PV in  (Op followed by AdjOp)
+  void HermOp(const Field &in, Field &out){
+    Field MdagPVin(in.Grid());
+    Op(in,MdagPVin);
+    AdjOp(MdagPVin,out);
+  }
+};
+template<class Matrix,class Field>
+class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;  // supplies M / Mdag
+  Matrix &_PV;   // second matrix; its M / Mdag supply PV / PV^dag
+  RealD shift;   // real multiple of the input added by Op and AdjOp
+public:
+  // Initialisers are listed in member declaration order (_Mat, _PV, shift).
+  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
+  void OpDiag (const Field &in, Field &out) {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
+  // out = (PV^dag M + shift) in
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.M(in,tmp);
+    _PV.Mdag(tmp,out);
+    out = out + shift * in;
+  }
+  // out = (M^dag PV + shift) in : the adjoint of Op, shift being real
+  void AdjOp     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _PV.M(in,tmp);      // PV acts on the input first ...
+    _Mat.Mdag(tmp,out); // ... then M^dag maps the intermediate into out
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+  // out = AdjOp(Op(in))
+  void HermOp(const Field &in, Field &out){
+    Field tmp(in.Grid());
+    Op(in,tmp);
+    AdjOp(tmp,out);
+  }
+};
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();
+
+  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
+  using FineField      = typename Aggregates::FineField;
+  using CoarseVector   = typename Aggregates::CoarseVector;
+  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
+  using FineOperator   = LinearOperatorBase<FineField>;
+  using FineSmoother   = LinearFunction    <FineField>;
+  using CoarseOperator = LinearOperatorBase<CoarseVector>;
+  using CoarseSolver   = LinearFunction    <CoarseVector>;
+  ///////////////////////////////////////////////////////////////
+  // SVD picture:  M = U S Vdag
+  //
+  // Vc and Uc are subsets, held as Complex_{f,c} matrices; being
+  // non-square, they are the matrices that do the coarsening.
+  //
+  // The fine problem
+  //
+  //      M psi = eta
+  //
+  // is approximated on the coarse space through
+  //
+  //      Uc^dag U S Vdag Vc Vc^dag psi = Uc^dag eta
+  //
+  // that is   M_coarse psi_c = eta_c   with   psi_c = Vc^dag psi
+  // operator() below projects with _U and promotes with _V.
+  ///////////////////////////////////////////////////////////////
+  Aggregates     & _U;               // ProjectToSubspace is called on this
+  Aggregates     & _V;               // PromoteFromSubspace is called on this
+  FineOperator   & _FineOperator;
+  FineSmoother   & _PreSmoother;
+  FineSmoother   & _PostSmoother;
+  CoarseOperator & _CoarseOperator;  // held but not referenced by operator()
+  CoarseSolver   & _CoarseSolve;
+  int              level;
+  void Level(int lv) { level = lv; }
+
+  MGPreconditionerSVD(Aggregates     &U,
+                      Aggregates     &V,
+                      FineOperator   &Fine,
+                      FineSmoother   &PreSmoother,
+                      FineSmoother   &PostSmoother,
+                      CoarseOperator &CoarseOperator_,
+                      CoarseSolver   &CoarseSolve_)
+    : _U(U),
+      _V(V),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  virtual void operator()(const FineField &in, FineField & out)
+  {
+    // Sequence: pre-smooth, restrict residual, coarse solve, prolong, post-smooth.
+    // Each timed stage logs the usecond() difference divided by 1000 as "ms".
+    GridBase *cgrid = _U.CoarseGrid;
+    CoarseVector c_rhs(cgrid);
+    CoarseVector c_sol(cgrid);
+    FineField    resid(in.Grid());
+    FineField    post (in.Grid());
+    double t0, dt;
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    // ---- Pre-smoother
+    //      out is zeroed first; only the smoother call is timed
+    out = Zero();
+    t0 = usecond();
+    _PreSmoother(in,out);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "PreSmoother took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // Residual update for the current out (sub(a,b,c) presumably sets a = b - c)
+    _FineOperator.Op(out,resid);
+    sub(resid,in,resid);
+
+    // ---- Restriction
+    //      project the fine residual onto the coarse vector c_rhs using _U
+    //      (the Uc^dag eta side of the coarse equation in the class comment)
+    t0 = usecond();
+    _U.ProjectToSubspace(c_rhs,resid);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Project to coarse took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // ---- Coarse correction
+    //      zeroing c_sol is inside the timed region
+    t0 = usecond();
+    c_sol = Zero();
+    _CoarseSolve(c_rhs,c_sol);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Coarse solve took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // ---- Prolongation
+    //      resid is reused to hold the solution promoted with _V, then added to out
+    t0 = usecond();
+    _V.PromoteFromSubspace(c_sol,resid);
+    add(out,out,resid);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "Promote to this level took "<< dt/1000.0<< "ms" <<std::endl;
+
+    // Residual update again, with out now including the coarse correction
+    _FineOperator.Op(out,resid);
+    sub(resid,in,resid);
+
+    // ---- Post-smoother
+    //      applied to the residual from a zero guess; its result is added to out
+    t0 = usecond();
+    post = Zero();
+    _PostSmoother(resid,post);
+    dt = usecond() - t0;
+    std::cout<<GridLogMessage << "PostSmoother took "<< dt/1000.0<< "ms" <<std::endl;
+
+    add(out,out,post);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion    src(FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+  
+  RealD mass=0.01;
+  RealD M5=1.8;
+
+  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
+
+  const int nbasis = 60;
+  const int cb = 0 ;
+
+  NextToNearestStencilGeometry5D geom(Coarse5d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
+  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
+  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
+  PVdagM_t PVdagM(Ddwf,Dpv);
+  MdagPV_t MdagPV(Ddwf,Dpv);
+  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
+  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
+  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
+  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
+
+
+  // Run power method on HOA??
+  PowerMethod<LatticeFermion>       PM;
+  PM(PVdagM,src);
+  PM(MdagPV,src);
+
+ 
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace V(Coarse5d,FGrid,cb);
+  //  Subspace U(Coarse5d,FGrid,cb);
+
+  // Breeds right singular vectors with call to HermOp
+  V.CreateSubspaceChebyshev(RNG5,PVdagM,
+			    nbasis,
+			    4000.0,0.003,
+			    300);
+
+  // Breeds left singular vectors with call to HermOp
+  //  U.CreateSubspaceChebyshev(RNG5,MdagPV,
+  //			    nbasis,
+  //			    4000.0,0.003,
+  //			    300);
+  //  U.subspace=V.subspace;
+  
+  //  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
+  //  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
+  //  for(int b=0;b<nbasis;b++){
+  //    CombinedUV.subspace[b]        = V.subspace[b];
+  //    CombinedUV.subspace[b+nbasis] = U.subspace[b];
+  //  }
+  
+  
+  //  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
+  LittleDiracOpPV.CoarsenOperator(PVdagM,V,V);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  CoarseVector c_src (Coarse5d);
+  CoarseVector c_res (Coarse5d);
+  CoarseVector c_proj(Coarse5d);
+
+
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+
+  //  blockPromote(c_src,err,CoarseToFine.subspace);
+
+  LatticeFermion prom(FGrid);
+  prom=Zero();
+  for(int b=0;b<nbasis;b++){
+    prom=prom+V.subspace[b];
+  }
+
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  PVdagM.Op(prom,tmp);
+  blockProject(c_proj,tmp,V.subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOpPV.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+
+  c_proj = c_proj - c_res;
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L3PGCR(1.0e-4, 10, LinOpCoarse,simple,20,20); 
+  L3PGCR.Level(3);
+  c_res=Zero();
+  L3PGCR(c_src,c_res);
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res);
+
+  //  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
+  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
+
+  //  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
+  TwoLevelMG TwoLevelPrecon(V,V,
+			    PVdagM,
+			    simple_fine,
+			    SmootherGCR,
+			    LinOpCoarse,
+			    L3PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_wilson.cc b/tests/debug/Test_general_coarse_wilson.cc
new file mode 100644
index 00000000..d2845cb8
--- /dev/null
+++ b/tests/debug/Test_general_coarse_wilson.cc
@@ -0,0 +1,333 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_wilson.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+// Two-grid preconditioner: pre-smooth, coarse-grid correction, post-smooth.
+// Only references are stored, so every constructor argument must outlive this object.
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();
+
+  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
+  using FineField      = typename Aggregates::FineField;
+  using CoarseVector   = typename Aggregates::CoarseVector;
+  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
+  using FineOperator   = LinearOperatorBase<FineField>;
+  using FineSmoother   = LinearFunction    <FineField>;
+  using CoarseOperator = LinearOperatorBase<CoarseVector>;
+  using CoarseSolver   = LinearFunction    <CoarseVector>;
+
+  Aggregates     & _Aggregates;     // supplies CoarseGrid, ProjectToSubspace, PromoteFromSubspace
+  FineOperator   & _FineOperator;   // applied through Op() to form residuals
+  FineSmoother   & _PreSmoother;    // applied to the input before the coarse correction
+  FineSmoother   & _PostSmoother;   // applied to the residual after the coarse correction
+  CoarseOperator & _CoarseOperator; // stored, but operator() never touches it
+  CoarseSolver   & _CoarseSolve;    // invoked on the projected residual
+
+  int level; // starts at 1; only written by Level(), never read in this class
+  void Level(int lv) { level = lv; }
+
+  MGPreconditioner(Aggregates     &aggregates,
+                   FineOperator   &fine_op,
+                   FineSmoother   &pre_smoother,
+                   FineSmoother   &post_smoother,
+                   CoarseOperator &coarse_op,
+                   CoarseSolver   &coarse_solver)
+    : _Aggregates(aggregates),
+      _FineOperator(fine_op),
+      _PreSmoother(pre_smoother),
+      _PostSmoother(post_smoother),
+      _CoarseOperator(coarse_op),
+      _CoarseSolve(coarse_solver),
+      level(1) {}
+
+  // Run one cycle on `in`; `out` is zeroed first and then receives the result.
+  virtual void operator()(const FineField &in, FineField & out)
+  {
+    auto report_ms = [](const char *what, double usecs) {
+      std::cout<<GridLogMessage << what << usecs/1000.0 << "ms" <<std::endl;
+    };
+
+    GridBase *coarse_grid = _Aggregates.CoarseGrid;
+    CoarseVector coarse_rhs(coarse_grid);
+    CoarseVector coarse_sol(coarse_grid);
+    FineField resid(in.Grid());
+    FineField post (in.Grid());
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+    // Pre-smooth, starting from a zero guess.
+    out = Zero();
+    double pre_us = -usecond();
+    _PreSmoother(in,out);
+    pre_us += usecond();
+    report_ms("PreSmoother took ", pre_us);
+
+    // resid = in - Op(out)
+    _FineOperator.Op(out,resid);
+    sub(resid, in, resid);
+
+    // Project the residual onto the coarse space.
+    double project_us = -usecond();
+    _Aggregates.ProjectToSubspace(coarse_rhs,resid);
+    project_us += usecond();
+    report_ms("Project to coarse took ", project_us);
+
+    // Coarse solve from a zero guess; the zeroing is part of the timed region.
+    double solve_us = -usecond();
+    coarse_sol = Zero();
+    _CoarseSolve(coarse_rhs,coarse_sol);
+    solve_us += usecond();
+    report_ms("Coarse solve took ", solve_us);
+
+    // Promote the coarse solution and add it to the running answer.
+    double promote_us = -usecond();
+    _Aggregates.PromoteFromSubspace(coarse_sol,resid);
+    add(out,out,resid);
+    promote_us += usecond();
+    report_ms("Promote to this level took ", promote_us);
+
+    // Recompute resid = in - Op(out) for the post-smoother.
+    _FineOperator.Op(out,resid);
+    sub(resid, in, resid);
+
+    // Post-smooth the residual from a zero guess, then accumulate.
+    double post_us = -usecond();
+    post = Zero();
+    _PostSmoother(resid,post);
+    post_us += usecond();
+    report_ms("PostSmoother took ", post_us);
+
+    add(out,out,post);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+// Debug driver: coarsen the Wilson operator onto a grid with every extent halved, using a
+// gamma5-doubled basis, compare coarse and fine operators, then run a two-level solve.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  // Fermion fields share the gauge grid; there is no separate fermion grid in this test.
+  GridCartesian         * FGrid   = UGrid;
+
+  // Construct a coarsened grid: every extent of the default lattice is halved
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+    //clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+
+  std::vector<int> seeds4({1,2,3,4});
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+
+  Complex one(1.0);
+
+  LatticeFermion    src(FGrid); src=one;
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeFermion    precsrc(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  // The gauge configuration is read from the relative path "ckpoint_lat".
+  FieldMetaData header;
+  std::string file("ckpoint_lat");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  RealD csw =0.0;
+  RealD mass=-0.92;
+
+  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
+
+  const int nbasis = 20; // vectors generated below; the coarse space uses 2*nbasis
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse4d,FGrid,cb);
+
+  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
+  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);
+
+  Aggregates.CreateSubspaceGCR(RNG4,
+			       LinOpDw,
+			       nbasis);
+
+  // Double the basis: slots [0,nbasis) copy the vectors, slots [nbasis,2*nbasis) hold Gamma5 times them.
+  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
+  CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
+  Gamma G5(Gamma::Algebra::Gamma5);
+  for(int b=0;b<nbasis;b++){
+    CombinedUV.subspace[b]        = Aggregates.subspace[b];
+    CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
+  }
+
+  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
+  LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+
+  CoarseVector c_src (Coarse4d);
+  CoarseVector c_res (Coarse4d);
+  CoarseVector c_proj(Coarse4d);
+
+  std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
+  subspace=CombinedUV.subspace;
+
+  // Check 1: promoting an all-ones coarse vector should match the plain sum of the basis vectors.
+  c_src = one;  // 1 in every element for vector 1.
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<2*nbasis;b++){
+    prom=prom+subspace[b];
+  }
+  err=err-prom;
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+  // Check 2: the fine operator applied to prom, projected, is compared with the coarse operator on c_src.
+  LinOpDw.Op(prom,tmp);
+  blockProject(c_proj,tmp,subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOp.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
+  c_proj = c_proj - c_res;
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  // CG on the MdagM operator with source Mdag*src; the result is zeroed again afterwards
+  {
+    MdagMLinearOperator<WilsonFermionD,LatticeFermion> HermOp(Dw);
+    ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+    Dw.Mdag(src,precsrc);
+    CG(HermOp,precsrc,result);
+    result=Zero();
+  }
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+  // Note: the solver below iterates on ShiftedLinOpCoarse (shift 0.001), not on LinOpCoarse.
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
+  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
+  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
+  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res);
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,4,4);
+  SmootherGCR.Level(2);
+
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res);
+
+  typedef MGPreconditioner<vSpinColourVector,  vTComplex,2*nbasis> TwoLevelMG;
+  // Argument order: aggregates, fine op, pre-smoother (trivial), post-smoother (GCR), coarse op, coarse solver.
+  TwoLevelMG TwoLevelPrecon(CombinedUV,
+			    LinOpDw,
+			    simple_fine,
+			    SmootherGCR,
+			    LinOpCoarse,
+			    L2PGCR);
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_wilson_nog5.cc b/tests/debug/Test_general_coarse_wilson_nog5.cc
new file mode 100644
index 00000000..c3382c25
--- /dev/null
+++ b/tests/debug/Test_general_coarse_wilson_nog5.cc
@@ -0,0 +1,326 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_wilson_nog5.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+// Two-grid preconditioner: pre-smooth, coarse-grid correction, post-smooth.
+// Only references are stored, so every constructor argument must outlive this object.
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
+public:
+  using LinearFunction<Lattice<Fobj> >::operator();
+
+  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
+  using FineField      = typename Aggregates::FineField;
+  using CoarseVector   = typename Aggregates::CoarseVector;
+  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
+  using FineOperator   = LinearOperatorBase<FineField>;
+  using FineSmoother   = LinearFunction    <FineField>;
+  using CoarseOperator = LinearOperatorBase<CoarseVector>;
+  using CoarseSolver   = LinearFunction    <CoarseVector>;
+
+  Aggregates     & _Aggregates;     // supplies CoarseGrid, ProjectToSubspace, PromoteFromSubspace
+  FineOperator   & _FineOperator;   // applied through Op() to form residuals
+  FineSmoother   & _PreSmoother;    // applied to the input before the coarse correction
+  FineSmoother   & _PostSmoother;   // applied to the residual after the coarse correction
+  CoarseOperator & _CoarseOperator; // stored, but operator() never touches it
+  CoarseSolver   & _CoarseSolve;    // invoked on the projected residual
+
+  int level; // starts at 1; only written by Level(), never read in this class
+  void Level(int lv) { level = lv; }
+
+  MGPreconditioner(Aggregates     &aggregates,
+                   FineOperator   &fine_op,
+                   FineSmoother   &pre_smoother,
+                   FineSmoother   &post_smoother,
+                   CoarseOperator &coarse_op,
+                   CoarseSolver   &coarse_solver)
+    : _Aggregates(aggregates),
+      _FineOperator(fine_op),
+      _PreSmoother(pre_smoother),
+      _PostSmoother(post_smoother),
+      _CoarseOperator(coarse_op),
+      _CoarseSolve(coarse_solver),
+      level(1) {}
+
+  // Run one cycle on `in`; `out` is zeroed first and then receives the result.
+  virtual void operator()(const FineField &in, FineField & out)
+  {
+    auto report_ms = [](const char *what, double usecs) {
+      std::cout<<GridLogMessage << what << usecs/1000.0 << "ms" <<std::endl;
+    };
+
+    GridBase *coarse_grid = _Aggregates.CoarseGrid;
+    CoarseVector coarse_rhs(coarse_grid);
+    CoarseVector coarse_sol(coarse_grid);
+    FineField resid(in.Grid());
+    FineField post (in.Grid());
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+    // Pre-smooth, starting from a zero guess.
+    out = Zero();
+    double pre_us = -usecond();
+    _PreSmoother(in,out);
+    pre_us += usecond();
+    report_ms("PreSmoother took ", pre_us);
+
+    // resid = in - Op(out)
+    _FineOperator.Op(out,resid);
+    sub(resid, in, resid);
+
+    // Project the residual onto the coarse space.
+    double project_us = -usecond();
+    _Aggregates.ProjectToSubspace(coarse_rhs,resid);
+    project_us += usecond();
+    report_ms("Project to coarse took ", project_us);
+
+    // Coarse solve from a zero guess; the zeroing is part of the timed region.
+    double solve_us = -usecond();
+    coarse_sol = Zero();
+    _CoarseSolve(coarse_rhs,coarse_sol);
+    solve_us += usecond();
+    report_ms("Coarse solve took ", solve_us);
+
+    // Promote the coarse solution and add it to the running answer.
+    double promote_us = -usecond();
+    _Aggregates.PromoteFromSubspace(coarse_sol,resid);
+    add(out,out,resid);
+    promote_us += usecond();
+    report_ms("Promote to this level took ", promote_us);
+
+    // Recompute resid = in - Op(out) for the post-smoother.
+    _FineOperator.Op(out,resid);
+    sub(resid, in, resid);
+
+    // Post-smooth the residual from a zero guess, then accumulate.
+    double post_us = -usecond();
+    post = Zero();
+    _PostSmoother(resid,post);
+    post_us += usecond();
+    report_ms("PostSmoother took ", post_us);
+
+    add(out,out,post);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+// Debug driver: coarsen the Wilson operator onto a grid with every extent halved, using the
+// CreateSubspaceGCR basis as is, compare coarse and fine operators, then run a two-level solve.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  // Fermion fields share the gauge grid; there is no separate fermion grid in this test.
+  GridCartesian         * FGrid   = UGrid;
+
+  // Construct a coarsened grid: every extent of the default lattice is halved
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2;
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+
+  std::vector<int> seeds4({1,2,3,4});
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+
+  Complex one(1.0);
+
+  LatticeFermion    src(FGrid); src=one;
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeFermion    precsrc(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  // The gauge configuration is read from the relative path "ckpoint_lat".
+  FieldMetaData header;
+  std::string file("ckpoint_lat");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  RealD csw =0.0;
+  RealD mass=-0.92;
+
+  // csw is zero and is passed for both trailing constructor arguments.
+  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
+
+  const int nbasis = 40;
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse4d,FGrid,cb);
+
+  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
+  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);
+
+  Aggregates.CreateSubspaceGCR(RNG4,
+			       LinOpDw,
+			       nbasis);
+
+  // The coarse operator is built directly from the nbasis aggregate vectors.
+  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
+  LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+
+  CoarseVector c_src (Coarse4d);
+  CoarseVector c_res (Coarse4d);
+  CoarseVector c_proj(Coarse4d);
+
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  subspace=Aggregates.subspace;
+
+  // Check 1: promoting an all-ones coarse vector should match the plain sum of the basis vectors.
+  c_src = one;  // 1 in every element for vector 1.
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<nbasis;b++){
+    prom=prom+subspace[b];
+  }
+  err=err-prom;
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+  // Check 2: the fine operator applied to prom, projected, is compared with the coarse operator on c_src.
+  LinOpDw.Op(prom,tmp);
+  blockProject(c_proj,tmp,subspace);
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOp.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
+  c_proj = c_proj - c_res;
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  // CG on the MdagM operator with source Mdag*src; the result is zeroed again afterwards
+  {
+    MdagMLinearOperator<WilsonFermionD,LatticeFermion> HermOp(Dw);
+    ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+    Dw.Mdag(src,precsrc);
+    CG(HermOp,precsrc,result);
+    result=Zero();
+  }
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+  // Note: ShiftedLinOpCoarse is constructed but only the commented-out solver variant refers to it.
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
+  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
+  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
+  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30); 
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res);
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,6,6);
+  SmootherGCR.Level(2);
+
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res);
+
+  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
+  // Argument order: aggregates, fine op, pre-smoother (trivial), post-smoother (GCR), coarse op, coarse solver.
+  TwoLevelMG TwoLevelPrecon(Aggregates,
+			    LinOpDw,
+			    simple_fine,
+			    SmootherGCR,
+			    LinOpCoarse,
+			    L2PGCR);
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_wilson_svd.cc b/tests/debug/Test_general_coarse_wilson_svd.cc
new file mode 100644
index 00000000..1d058878
--- /dev/null
+++ b/tests/debug/Test_general_coarse_wilson_svd.cc
@@ -0,0 +1,320 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_wilson_svd.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditioner : public LinearFunction< Lattice<Fobj> > { // Two-level preconditioner: pre-smooth, coarse-grid correction, post-smooth (see operator())
+public:
+  using LinearFunction<Lattice<Fobj> >::operator(); // keep the base-class operator() overloads visible next to the one defined below
+
+  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
+  typedef LinearOperatorBase<FineField>                            FineOperator;
+  typedef LinearFunction    <FineField>                            FineSmoother;
+  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
+  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
+  Aggregates     & _Aggregates; // all six members are references: the caller must keep the referenced objects alive
+  FineOperator   & _FineOperator;
+  FineSmoother   & _PreSmoother;
+  FineSmoother   & _PostSmoother;
+  CoarseOperator & _CoarseOperator; // stored, but operator() does not use it (its only use there is commented out)
+  CoarseSolver   & _CoarseSolve;
+
+  int    level;  void Level(int lv) {level = lv; }; // level tag: set to 1 by the constructor; not read anywhere in this class
+
+  MGPreconditioner(Aggregates &Agg,
+		   FineOperator &Fine,
+		   FineSmoother &PreSmoother,
+		   FineSmoother &PostSmoother,
+		   CoarseOperator &CoarseOperator_,
+		   CoarseSolver &CoarseSolve_)
+    : _Aggregates(Agg),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  virtual void operator()(const FineField &in, FineField & out) // apply the preconditioner to 'in'; 'out' is zeroed first and then overwritten
+  {
+    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
+    //    auto CoarseGrid = _CoarseOperator.Grid();
+    CoarseVector Csrc(CoarseGrid);
+    CoarseVector Csol(CoarseGrid);
+    FineField vec1(in.Grid());
+    FineField vec2(in.Grid());
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
+    double t; // reused stopwatch: t=-usecond() before a stage, t+=usecond() after it, then printed as t/1000.0 "ms"
+    // Fine Smoother
+    //    out = in;
+    out = Zero();
+    t=-usecond();
+    _PreSmoother(in,out);
+    t+=usecond();
+
+    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Update the residual
+    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   // presumably vec1 = in - Op(out); assumes sub(r,a,b) computes r = a - b -- TODO confirm
+    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
+
+    // Fine to Coarse 
+    t=-usecond();
+    _Aggregates.ProjectToSubspace  (Csrc,vec1);
+    t+=usecond();
+    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse correction
+    t=-usecond();
+    Csol = Zero(); // the coarse solver is always started from a zero guess
+    _CoarseSolve(Csrc,Csol);
+    //Csol=Zero();
+    t+=usecond();
+    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse to Fine
+    t=-usecond();  
+    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
+    _Aggregates.PromoteFromSubspace(Csol,vec1); // vec1 is reused here: it now receives the promoted coarse solution
+    add(out,out,vec1);
+    t+=usecond();
+    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Residual
+    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  // same residual update as above, now for the coarse-corrected 'out'
+    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
+
+    // Fine Smoother
+    t=-usecond();
+    //    vec2=vec1;
+    vec2=Zero();
+    _PostSmoother(vec1,vec2); // post-smoother acts on the current residual vec1 and writes its result to vec2
+    t+=usecond();
+    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    add( out,out,vec2);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+int main (int argc, char ** argv) // Test driver: coarsen the Wilson-clover operator over a Gamma5-doubled subspace, compare it with the fine operator, then run the solvers
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16; // NOTE(review): Ls is never referenced again in this function
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = UGrid; // the fine fermion grid is the same object as the gauge grid
+  GridRedBlackCartesian * FrbGrid = UrbGrid; // NOTE(review): FrbGrid is not used below
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2; // coarse lattice extent is half the default extent in every dimension
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds); // NOTE(review): CRNG is seeded but not used afterwards
+
+  LatticeFermion    src(FGrid); random(RNG4,src); // NOTE(review): src, result and ref are initialised here but not used below
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat"); // relative file name, so it is looked up in the current working directory
+  NerscIO::readConfiguration(Umu,header,file);
+  
+  RealD csw =0.0;
+  RealD mass=-0.92;
+
+  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
+
+  const int nbasis = 20; // vectors requested from CreateSubspace; the coarse operator below works with 2*nbasis
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+ 
+  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse4d,FGrid,cb);
+
+  MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
+  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
+  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5); // only used by the fine-grid smoother further down
+
+  //  Aggregates.CreateSubspaceGCR(RNG4,
+  //			       LinOpDw,
+  //			       nbasis);
+  Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
+  
+  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
+  CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
+  for(int b=0;b<nbasis;b++){ // fill the doubled basis: first half copies Aggregates, second half is Gamma5 times the same vectors
+    Gamma G5(Gamma::Algebra::Gamma5);
+    CombinedUV.subspace[b]        = Aggregates.subspace[b];
+    CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
+  }
+
+  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
+  LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  CoarseVector c_src (Coarse4d);
+  CoarseVector c_res (Coarse4d);
+  CoarseVector c_proj(Coarse4d);
+
+  std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
+  subspace=CombinedUV.subspace; // local copy of the combined basis, used by blockPromote/blockProject below
+
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<2*nbasis;b++){ // prom = plain sum of all basis vectors
+    prom=prom+subspace[b];
+  }
+  err=err-prom; // difference between the promoted all-ones vector and the plain sum; its norm is printed next
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  LinOpDw.Op(prom,tmp);
+  blockProject(c_proj,tmp,subspace); // c_proj = fine operator applied to prom, projected onto the subspace
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOp.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
+  c_proj = c_proj - c_res; // mismatch between the projected fine operator and the coarse operator; only printed, not asserted
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res); // standalone coarse solve on the all-ones source; L2PGCR is reused as the coarse solver of the preconditioner below
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,4,4);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res); // standalone run of the smoother on the all-ones source
+
+  typedef MGPreconditioner<vSpinColourVector,  vTComplex,2*nbasis> TwoLevelMG;
+
+  TwoLevelMG TwoLevelPrecon(CombinedUV,
+			    LinOpDw,
+			    simple_fine, // PreSmoother argument: the trivial preconditioner
+			    SmootherGCR, // PostSmoother argument: the GCR smoother on the shifted operator
+			    LinOpCoarse,
+			    L2PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res); // outer solve on the fine operator, preconditioned by the two-level cycle
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_general_coarse_wilson_svd_no5g.cc b/tests/debug/Test_general_coarse_wilson_svd_no5g.cc
new file mode 100644
index 00000000..ad6e59fa
--- /dev/null
+++ b/tests/debug/Test_general_coarse_wilson_svd_no5g.cc
@@ -0,0 +1,312 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_general_coarse_wilson_svd_no5g.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Fobj,class CComplex,int nbasis>
+class MGPreconditioner : public LinearFunction< Lattice<Fobj> > { // Two-level preconditioner: pre-smooth, coarse-grid correction, post-smooth (see operator())
+public:
+  using LinearFunction<Lattice<Fobj> >::operator(); // keep the base-class operator() overloads visible next to the one defined below
+
+  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
+  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
+  typedef LinearOperatorBase<FineField>                            FineOperator;
+  typedef LinearFunction    <FineField>                            FineSmoother;
+  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
+  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
+  Aggregates     & _Aggregates; // all six members are references: the caller must keep the referenced objects alive
+  FineOperator   & _FineOperator;
+  FineSmoother   & _PreSmoother;
+  FineSmoother   & _PostSmoother;
+  CoarseOperator & _CoarseOperator; // stored, but operator() does not use it (its only use there is commented out)
+  CoarseSolver   & _CoarseSolve;
+
+  int    level;  void Level(int lv) {level = lv; }; // level tag: set to 1 by the constructor; not read anywhere in this class
+
+  MGPreconditioner(Aggregates &Agg,
+		   FineOperator &Fine,
+		   FineSmoother &PreSmoother,
+		   FineSmoother &PostSmoother,
+		   CoarseOperator &CoarseOperator_,
+		   CoarseSolver &CoarseSolve_)
+    : _Aggregates(Agg),
+      _FineOperator(Fine),
+      _PreSmoother(PreSmoother),
+      _PostSmoother(PostSmoother),
+      _CoarseOperator(CoarseOperator_),
+      _CoarseSolve(CoarseSolve_),
+      level(1)  {  }
+
+  virtual void operator()(const FineField &in, FineField & out) // apply the preconditioner to 'in'; 'out' is zeroed first and then overwritten
+  {
+    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
+    //    auto CoarseGrid = _CoarseOperator.Grid();
+    CoarseVector Csrc(CoarseGrid);
+    CoarseVector Csol(CoarseGrid);
+    FineField vec1(in.Grid());
+    FineField vec2(in.Grid());
+
+    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
+
+    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
+    double t; // reused stopwatch: t=-usecond() before a stage, t+=usecond() after it, then printed as t/1000.0 "ms"
+    // Fine Smoother
+    //    out = in;
+    out = Zero();
+    t=-usecond();
+    _PreSmoother(in,out);
+    t+=usecond();
+
+    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Update the residual
+    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   // presumably vec1 = in - Op(out); assumes sub(r,a,b) computes r = a - b -- TODO confirm
+    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
+
+    // Fine to Coarse 
+    t=-usecond();
+    _Aggregates.ProjectToSubspace  (Csrc,vec1);
+    t+=usecond();
+    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse correction
+    t=-usecond();
+    Csol = Zero(); // the coarse solver is always started from a zero guess
+    _CoarseSolve(Csrc,Csol);
+    //Csol=Zero();
+    t+=usecond();
+    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Coarse to Fine
+    t=-usecond();  
+    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
+    _Aggregates.PromoteFromSubspace(Csol,vec1); // vec1 is reused here: it now receives the promoted coarse solution
+    add(out,out,vec1);
+    t+=usecond();
+    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
+
+    // Residual
+    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  // same residual update as above, now for the coarse-corrected 'out'
+    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
+
+    // Fine Smoother
+    t=-usecond();
+    //    vec2=vec1;
+    vec2=Zero();
+    _PostSmoother(vec1,vec2); // post-smoother acts on the current residual vec1 and writes its result to vec2
+    t+=usecond();
+    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
+
+    add( out,out,vec2);
+    std::cout<<GridLogMessage << "Done " <<std::endl;
+  }
+};
+
+int main (int argc, char ** argv) // Test driver: coarsen the Wilson-clover operator over the nbasis-vector subspace (no Gamma5 doubling), compare it with the fine operator, then run the solvers
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16; // NOTE(review): Ls is never referenced again in this function
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * FGrid   = UGrid; // the fine fermion grid is the same object as the gauge grid
+  GridRedBlackCartesian * FrbGrid = UrbGrid; // NOTE(review): FrbGrid is not used below
+
+  // Construct a coarsened grid
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/2; // coarse lattice extent is half the default extent in every dimension
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds); // NOTE(review): CRNG is seeded but not used afterwards
+
+  LatticeFermion    src(FGrid); random(RNG4,src); // NOTE(review): src, result and ref are initialised here but not used below
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat"); // relative file name, so it is looked up in the current working directory
+  NerscIO::readConfiguration(Umu,header,file);
+  
+  RealD csw =0.0;
+  RealD mass=-0.92;
+
+  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
+
+  const int nbasis = 40; // number of subspace vectors; also the template size of the coarse operator in this test
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NearestStencilGeometry4D geom(Coarse4d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+ 
+  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse4d,FGrid,cb);
+
+  MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
+  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
+  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5); // only used by the fine-grid smoother further down
+
+  //  Aggregates.CreateSubspaceGCR(RNG4,
+  //			       LinOpDw,
+  //			       nbasis);
+  Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
+
+  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
+  LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
+  
+  CoarseVector c_src (Coarse4d);
+  CoarseVector c_res (Coarse4d);
+  CoarseVector c_proj(Coarse4d);
+
+  std::vector<LatticeFermion> subspace(nbasis,FGrid);
+  subspace=Aggregates.subspace; // local copy of the basis, used by blockPromote/blockProject below
+
+  Complex one(1.0);
+  c_src = one;  // 1 in every element for vector 1.
+  blockPromote(c_src,err,subspace);
+
+  prom=Zero();
+  for(int b=0;b<nbasis;b++){ // prom = plain sum of all basis vectors
+    prom=prom+subspace[b];
+  }
+  err=err-prom; // difference between the promoted all-ones vector and the plain sum; its norm is printed next
+  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
+  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
+  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
+
+  LinOpDw.Op(prom,tmp);
+  blockProject(c_proj,tmp,subspace); // c_proj = fine operator applied to prom, projected onto the subspace
+  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
+
+  LittleDiracOp.M(c_src,c_res);
+  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
+
+  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
+
+  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
+  c_proj = c_proj - c_res; // mismatch between the projected fine operator and the coarse operator; only printed, not asserted
+  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
+  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
+
+
+  /**********
+   * Some solvers
+   **********
+   */
+
+  ///////////////////////////////////////
+  // Coarse grid solver test
+  ///////////////////////////////////////
+
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<CoarseVector> simple;
+  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
+  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30); 
+  L2PGCR.Level(3);
+  c_res=Zero();
+  L2PGCR(c_src,c_res); // standalone coarse solve on the all-ones source; L2PGCR is reused as the coarse solver of the preconditioner below
+
+  ////////////////////////////////////////
+  // Fine grid smoother
+  ////////////////////////////////////////
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
+  std::cout<<GridLogMessage<<"******************* "<<std::endl;
+  TrivialPrecon<LatticeFermionD> simple_fine;
+
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,6,6);
+  SmootherGCR.Level(2);
+  
+  LatticeFermionD f_src(FGrid);
+  LatticeFermionD f_res(FGrid);
+
+  f_src = one;  // 1 in every element for vector 1.
+  f_res=Zero();
+  SmootherGCR(f_src,f_res); // standalone run of the smoother on the all-ones source
+
+  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
+
+  TwoLevelMG TwoLevelPrecon(Aggregates,
+			    LinOpDw,
+			    simple_fine, // PreSmoother argument: the trivial preconditioner
+			    SmootherGCR, // PostSmoother argument: the GCR smoother on the shifted operator
+			    LinOpCoarse,
+			    L2PGCR);
+  
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
+  L1PGCR.Level(1);
+
+  f_res=Zero();
+  L1PGCR(f_src,f_res); // outer solve on the fine operator, preconditioned by the two-level cycle
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/debug/Test_iwasaki_action_newstaple.cc b/tests/debug/Test_iwasaki_action_newstaple.cc
index 06bdaadf..5d08dd78 100644
--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@@ -177,7 +177,7 @@ int main (int argc, char ** argv)
     GaugeLorentz diff = derivOrig - derivNew;
     double n = norm2(diff);
     std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
-    assert(n<1e-10);
+    GRID_ASSERT(n<1e-10);
 
     std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
     torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
diff --git a/tests/debug/Test_optimized_staple_gaugebc.cc b/tests/debug/Test_optimized_staple_gaugebc.cc
index 51628ab0..06f2d19b 100644
--- a/tests/debug/Test_optimized_staple_gaugebc.cc
+++ b/tests/debug/Test_optimized_staple_gaugebc.cc
@@ -86,7 +86,7 @@ int main (int argc, char ** argv)
     GaugeMat diff = staple_orig - staple_opt;
     double n = norm2(diff);
     std::cout << GridLogMessage << mu << " " << n << std::endl;
-    assert(n<1e-10);
+    GRID_ASSERT(n<1e-10);
   }
   std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  optimized: " << topt/1000/count << "ms" << std::endl;
   
diff --git a/tests/debug/Test_padded_cell.cc b/tests/debug/Test_padded_cell.cc
index 593b3542..3af55669 100644
--- a/tests/debug/Test_padded_cell.cc
+++ b/tests/debug/Test_padded_cell.cc
@@ -106,7 +106,7 @@ int main (int argc, char ** argv)
     peekLocalSite(g,Ug_v,gcoor);
     peekLocalSite(l,Ul_v,lcoor);
     g=g-l;
-    assert(norm2(g)==0);
+    GRID_ASSERT(norm2(g)==0);
     diff = diff + norm2(g);
     n = n + norm2(l);
   }}}}
@@ -198,6 +198,6 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Average plaquette via padded cell "<<result<<std::endl;
   std::cout << GridLogMessage << " Diff "<<result-plaq<<std::endl;
   
-  assert(fabs(result-plaq)<1.0e-8);
+  GRID_ASSERT(fabs(result-plaq)<1.0e-8);
   Grid_finalize();
 }
diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc
index 326f8810..cdc1337b 100644
--- a/tests/debug/Test_padded_cell_staple.cc
+++ b/tests/debug/Test_padded_cell_staple.cc
@@ -572,7 +572,7 @@ int main (int argc, char ** argv)
     GaugeMat diff = staple_orig - staple_padded;
     double n = norm2(diff);
     std::cout << GridLogMessage << mu << " " << n << std::endl;
-    assert(n<1e-10);
+    GRID_ASSERT(n<1e-10);
   }
   std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms,  padded: " << tpadded/1000/count << "ms" << std::endl;
   
diff --git a/tests/disable_tests_without_instantiations.h b/tests/disable_tests_without_instantiations.h
new file mode 100644
index 00000000..dbc50d5b
--- /dev/null
+++ b/tests/disable_tests_without_instantiations.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifndef BUILD_FERMION_INSTANTIATIONS
+#include <iostream>
+
+int main(void) {  // Stub entry point for builds without BUILD_FERMION_INSTANTIATIONS: explain why the test cannot run, then fail
+  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
+	    << "which this test relies on. "
+	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "  // trailing space keeps the flag separate from the next literal
+	    << "to run this test."
+	    << std::endl;
+  return 1;  // non-zero exit status so the test is reported as not passed
+}
+#endif
diff --git a/tests/forces/Test_bdy.cc b/tests/forces/Test_bdy.cc
index c2c97d0d..d328b49f 100644
--- a/tests/forces/Test_bdy.cc
+++ b/tests/forces/Test_bdy.cc
@@ -179,7 +179,7 @@ void ForceTest(Action<LatticeGaugeField> &action,LatticeGaugeField & U,MomentumF
   std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
   std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
   std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
-  //  assert(diff<1.0);
+  //  GRID_ASSERT(diff<1.0);
   std::cout<< GridLogMessage << "Done" <<std::endl;
   std::cout << GridLogMessage << "*********************************************************"<<std::endl;
 }
diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc
index 526cde12..5424e762 100644
--- a/tests/forces/Test_contfrac_force.cc
+++ b/tests/forces/Test_contfrac_force.cc
@@ -146,7 +146,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_double_ratio.cc b/tests/forces/Test_double_ratio.cc
index a2b16719..418ba87b 100644
--- a/tests/forces/Test_double_ratio.cc
+++ b/tests/forces/Test_double_ratio.cc
@@ -376,7 +376,7 @@ void ForceTest(Action<LatticeGaugeField> &action,LatticeGaugeField & U,MomentumF
   std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
   std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
   std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
-  //  assert(diff<1.0);
+  //  GRID_ASSERT(diff<1.0);
   std::cout<< GridLogMessage << "Done" <<std::endl;
   std::cout << GridLogMessage << "*********************************************************"<<std::endl;
 }
diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc
index 1ae28bb2..6a3e7c6d 100644
--- a/tests/forces/Test_dwf_force.cc
+++ b/tests/forces/Test_dwf_force.cc
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc
index d820573b..fa1ae3a7 100644
--- a/tests/forces/Test_dwf_force_eofa.cc
+++ b/tests/forces/Test_dwf_force_eofa.cc
@@ -166,7 +166,7 @@ int main (int argc, char** argv)
   printf("real(dS_predict) = %1.15e\n", dSpred.real());
   printf("imag(dS_predict) = %1.15e\n\n", dSpred.imag());
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc
index db61813e..027af145 100644
--- a/tests/forces/Test_dwf_gpforce.cc
+++ b/tests/forces/Test_dwf_gpforce.cc
@@ -241,7 +241,7 @@ int main (int argc, char ** argv)
 
   std::cout << GridLogMessage << "dS - dt^2 term "<< Hmomprime - Hmom + Sprime - S - dSm2 <<std::endl;
   
-  assert( fabs(real(Sprime-S-dSpred)) < 5.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 5.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc
index d488cc7d..3e89f319 100644
--- a/tests/forces/Test_dwf_gpforce_eofa.cc
+++ b/tests/forces/Test_dwf_gpforce_eofa.cc
@@ -170,7 +170,7 @@ int main (int argc, char** argv)
   printf("real(dS_predict) = %1.15e\n", dSpred.real());
   printf("imag(dS_predict) = %1.15e\n\n", dSpred.imag());
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_fthmc.cc b/tests/forces/Test_fthmc.cc
index 50f60f99..2b0d20d5 100644
--- a/tests/forces/Test_fthmc.cc
+++ b/tests/forces/Test_fthmc.cc
@@ -62,6 +62,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
   
   Gimpl::generate_momenta(P,sRNG,RNG4);
   //  Filter.applyFilter(P);
+  std::cout << GridLogMessage << "Initial momenta " << norm2(P) << std::endl;
 
   action.refresh(smU,sRNG,RNG4);
 
@@ -70,6 +71,8 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
   std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
 
   RealD S1 = action.S(smU);
+  std::cout << GridLogMessage << "Initial action " << S1 << std::endl;
+
 
   Gimpl::update_field(P,U,eps);
   smU.set_Field(U);
@@ -80,6 +83,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
   action.deriv(smU,UdSdU);
   UdSdU = Ta(UdSdU);
   //  Filter.applyFilter(UdSdU);
+  std::cout << GridLogMessage << "Derivative " << norm2(UdSdU) << std::endl;
 
   DumpSliceNorm("Force",UdSdU,Nd-1);
   
@@ -91,6 +95,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
   std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
   
   RealD S2 = action.S(smU);
+  std::cout << GridLogMessage << "Final action " << S2 << std::endl;
 
   // Use the derivative
   LatticeComplex dS(UGrid); dS = Zero();
@@ -109,7 +114,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
   std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
   std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
   std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
-  //  assert(diff<1.0);
+  //  GRID_ASSERT(diff<1.0);
   std::cout<< GridLogMessage << "Done" <<std::endl;
   std::cout << GridLogMessage << "*********************************************************"<<std::endl;
 }
@@ -145,6 +150,8 @@ int main (int argc, char ** argv)
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds);
   SU<Nc>::HotConfiguration(RNG4,U);
 #endif
+  std::cout << GridLogMessage << "Initial plaquette: " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
+
 
   
   WilsonGaugeActionR  PlaqAction(6.0);
diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc
index bc2b5b26..a4b4ef90 100644
--- a/tests/forces/Test_gp_plaq_force.cc
+++ b/tests/forces/Test_gp_plaq_force.cc
@@ -120,7 +120,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl;
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
 }
diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc
index e277ea6b..ee04c2e2 100644
--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@@ -125,7 +125,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl;
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-1 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0e-1 ) ;
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
 }
diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc
index f763d12e..413c0c84 100644
--- a/tests/forces/Test_gpdwf_force.cc
+++ b/tests/forces/Test_gpdwf_force.cc
@@ -202,7 +202,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
 #endif
diff --git a/tests/forces/Test_gpdwf_force_1f_2f.cc b/tests/forces/Test_gpdwf_force_1f_2f.cc
index bfe66cc4..878e77ba 100644
--- a/tests/forces/Test_gpdwf_force_1f_2f.cc
+++ b/tests/forces/Test_gpdwf_force_1f_2f.cc
@@ -43,7 +43,7 @@ void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f,
 
   int L_2f = UGrid_2f->FullDimensions()[nu];
   int L_1f = UGrid_1f->FullDimensions()[nu]; 
-  assert(L_1f == 2 * L_2f);
+  GRID_ASSERT(L_1f == 2 * L_2f);
 
   //Coordinate grid for reference
   LatticeInteger xcoor_1f(UGrid_1f);
@@ -73,7 +73,7 @@ void convertFermion1f_from_2f(FermionField1f &out_1f, const FermionField2f &in_2
 
   Integer L_2f = FGrid_2f->FullDimensions()[nu+nuoff];
   Integer L_1f = FGrid_1f->FullDimensions()[nu+nuoff];
-  assert(L_1f == 2 * L_2f);
+  GRID_ASSERT(L_1f == 2 * L_2f);
   
   auto in_f0_2fgrid = PeekIndex<GparityFlavourIndex>(in_2f,0); //flavor 0 on 2f Grid
   FermionField1f in_f0_1fgrid(FGrid_1f);
@@ -442,7 +442,7 @@ int main (int argc, char ** argv)
   }else if(action == "DSDR"){
     runTest<GparityWilsonTMFermionD, WilsonTMFermionD>(argc,argv);
   }else{
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 
diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc
index 8bd26a35..436ee6b0 100644
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -152,7 +152,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 2.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 2.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc
index dbaf1cbd..9352204d 100644
--- a/tests/forces/Test_laplacian_force.cc
+++ b/tests/forces/Test_laplacian_force.cc
@@ -166,7 +166,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl; 
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc
index 3518007c..081bcbaf 100644
--- a/tests/forces/Test_mobius_force.cc
+++ b/tests/forces/Test_mobius_force.cc
@@ -230,7 +230,7 @@ int main (int argc, char ** argv)
 
   std::cout << GridLogMessage << "dS - dt^2 term "<< Hmomprime - Hmom + Sprime - S - dSm2 <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
 
   
diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc
index a8871faa..a6d4eee8 100644
--- a/tests/forces/Test_mobius_force_eofa.cc
+++ b/tests/forces/Test_mobius_force_eofa.cc
@@ -111,8 +111,8 @@ int main (int argc, char** argv)
     
     std::cout << "(phi, Mphi) - (eta,eta): " << test << "  expect 0" << std::endl;
 
-    assert(test.real() < 1e-8);
-    assert(test.imag() < 1e-8);
+    GRID_ASSERT(test.real() < 1e-8);
+    GRID_ASSERT(test.imag() < 1e-8);
 
     //Another test is to use heatbath twice to apply M^{-1/2} to Phi then apply M
     // M  Phi' 
@@ -126,7 +126,7 @@ int main (int argc, char** argv)
     test2  = test2 - eta;
     RealD test2_norm = norm2(test2);
     std::cout << "|M M^{-1/2} M^{-1/2} eta - eta|^2 = " << test2_norm << " expect 0" << std::endl;
-    assert( test2_norm < 1e-8 );
+    GRID_ASSERT( test2_norm < 1e-8 );
   }
 
 
@@ -209,7 +209,7 @@ int main (int argc, char** argv)
   printf("real(dS_predict) = %1.15e\n", dSpred.real());
   printf("imag(dS_predict) = %1.15e\n\n", dSpred.imag());
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_mobius_gparity_eofa_mixed.cc b/tests/forces/Test_mobius_gparity_eofa_mixed.cc
index 5f141f5c..dbc73ce3 100644
--- a/tests/forces/Test_mobius_gparity_eofa_mixed.cc
+++ b/tests/forces/Test_mobius_gparity_eofa_mixed.cc
@@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid);
       std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
 
       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      GRID_ASSERT(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
       precisionChange(FermOpF.Umu, FermOpD.Umu);
 
@@ -210,14 +210,14 @@ int main (int argc, char** argv)
   
   std::cout << GridLogMessage << "Phi(double)=" << norm2(MeofaD.getPhi()) << " Phi(mixed)=" << norm2(MeofaMx.getPhi()) << " diff=" << n << std::endl;
 
-  assert(n < 1e-8);
+  GRID_ASSERT(n < 1e-8);
 
   RealD Sd = MeofaD.S(Ud);
   RealD Smx = MeofaMx.S(Ud);
 
   std::cout << GridLogMessage << "Initial action double=" << Sd << " mixed=" << Smx << " diff=" << Sd-Smx << std::endl;
 
-  assert(fabs(Sd-Smx) < 1e-6);
+  GRID_ASSERT(fabs(Sd-Smx) < 1e-6);
 
   SU<Nc>::HotConfiguration(RNG4,Ud);
   precisionChange(Uf, Ud);
@@ -227,7 +227,7 @@ int main (int argc, char** argv)
 
   std::cout << GridLogMessage << "After randomizing U, action double=" << Sd << " mixed=" << Smx << " diff=" << Sd-Smx << std::endl;
 
-  assert(fabs(Sd-Smx) < 1e-6);
+  GRID_ASSERT(fabs(Sd-Smx) < 1e-6);
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc
index c0b7117a..c668780a 100644
--- a/tests/forces/Test_mobius_gpforce_eofa.cc
+++ b/tests/forces/Test_mobius_gpforce_eofa.cc
@@ -167,7 +167,7 @@ int main (int argc, char** argv)
   printf("real(dS_predict) = %1.15e\n", dSpred.real());
   printf("imag(dS_predict) = %1.15e\n\n", dSpred.imag());
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc
index 173f7626..7d6fc2c2 100644
--- a/tests/forces/Test_partfrac_force.cc
+++ b/tests/forces/Test_partfrac_force.cc
@@ -149,7 +149,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc
index e40cb5fd..8cedff32 100644
--- a/tests/forces/Test_rect_force.cc
+++ b/tests/forces/Test_rect_force.cc
@@ -123,7 +123,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc
index f4bf8ed3..da3ba5e2 100644
--- a/tests/forces/Test_wilson_force.cc
+++ b/tests/forces/Test_wilson_force.cc
@@ -205,7 +205,7 @@ int main (int argc, char ** argv)
 
   std::cout << GridLogMessage << "Total dS    "<< Hmomprime - Hmom + Sprime - S <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc
index 8aa5eb9d..8d55665f 100644
--- a/tests/forces/Test_wilsonclover_force.cc
+++ b/tests/forces/Test_wilsonclover_force.cc
@@ -188,7 +188,7 @@ int main(int argc, char **argv)
 
   std::cout << GridLogMessage << "Total dS    " << Hmomprime - Hmom + Sprime - S << std::endl;
 
-  assert(fabs(real(Sprime - S - dSpred)) < 1.0);
+  GRID_ASSERT(fabs(real(Sprime - S - dSpred)) < 1.0);
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc
index 5d3a86f4..00ed75a3 100644
--- a/tests/forces/Test_zmobius_force.cc
+++ b/tests/forces/Test_zmobius_force.cc
@@ -163,7 +163,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
   std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
 
-  assert( fabs(real(Sprime-S-dSpred)) < 3.0 ) ;
+  GRID_ASSERT( fabs(real(Sprime-S-dSpred)) < 3.0 ) ;
 
   std::cout<< GridLogMessage << "Done" <<std::endl;
   Grid_finalize();
diff --git a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc
index 1c9d01fa..304b3aba 100644
--- a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc
+++ b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc
@@ -54,7 +54,7 @@ void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int g
   std::cout << "dim_2f " << dim_2f << std::endl;
   std::cout << "dim_1f " << dim_1f << std::endl;
   
-  assert(dim_1f[gpdir] == 2*dim_2f[gpdir]);
+  GRID_ASSERT(dim_1f[gpdir] == 2*dim_2f[gpdir]);
 
   LatticeInteger xcoor_1f(out.Grid()); //5d lattice integer
   LatticeCoordinate(xcoor_1f,gpdir);
diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc
index 31697c12..8063ef5e 100644
--- a/tests/hmc/Test_multishift_sqrt.cc
+++ b/tests/hmc/Test_multishift_sqrt.cc
@@ -139,7 +139,7 @@ int main (int argc, char ** argv)
   //  for(int n=0;n<poles.size();n++){
   //    a = a + residues[n]/(x+poles[n]);
   //  }
-  assert(Sqrt.order==degree);
+  GRID_ASSERT(Sqrt.order==degree);
 
   combined = Sqrt.norm*src;
   for(int i=0;i<degree;i++){
diff --git a/tests/hmc/Test_remez.cc b/tests/hmc/Test_remez.cc
index ff20e7f4..5042617a 100644
--- a/tests/hmc/Test_remez.cc
+++ b/tests/hmc/Test_remez.cc
@@ -110,10 +110,10 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "x^(1/4) : "<<ssx<<" "<<assx<<std::endl;
   std::cout<<GridLogMessage << "x^(-1/2): "<<isx<<" "<<aisx<<std::endl;
   std::cout<<GridLogMessage << "x^(-1/4): "<<issx<<" "<<aissx<<std::endl;
-  assert(fabs(sx-asx)<1.0e-6);
-  assert(fabs(ssx-assx)<1.0e-6);
-  assert(fabs(isx-aisx)<1.0e-6);
-  assert(fabs(issx-aissx)<1.0e-6);
+  GRID_ASSERT(fabs(sx-asx)<1.0e-6);
+  GRID_ASSERT(fabs(ssx-assx)<1.0e-6);
+  GRID_ASSERT(fabs(isx-aisx)<1.0e-6);
+  GRID_ASSERT(fabs(issx-aissx)<1.0e-6);
 
   Grid_finalize();
 }
diff --git a/tests/lanczos/Test_compressed_lanczos.cc b/tests/lanczos/Test_compressed_lanczos.cc
index 28df3f99..12afcd56 100644
--- a/tests/lanczos/Test_compressed_lanczos.cc
+++ b/tests/lanczos/Test_compressed_lanczos.cc
@@ -57,7 +57,7 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
-    assert(this->subspace.size()==nbasis);
+    GRID_ASSERT(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
     WR.open(evecs_file);
@@ -79,7 +79,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_fine);
     
-    assert(this->evals_fine.size()==nbasis);
+    GRID_ASSERT(this->evals_fine.size()==nbasis);
     
     std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
     emptyUserRecord record;
@@ -117,7 +117,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_coarse);
 
-    assert(this->evals_coarse.size()==nvec);
+    GRID_ASSERT(this->evals_coarse.size()==nvec);
     emptyUserRecord record;
     std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
     Grid::ScidacReader RD ;
@@ -163,18 +163,18 @@ int main (int argc, char ** argv) {
 
   auto fineLatt     = GridDefaultLatt();
   int dims=fineLatt.size();
-  assert(blockSize.size()==dims+1);
+  GRID_ASSERT(blockSize.size()==dims+1);
   Coordinate coarseLatt(dims);
 
   for (int d=0;d<coarseLatt.size();d++){
-    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    GRID_ASSERT(coarseLatt[d]*blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<coarseLatt.size();i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  int cLs = Ls/blockSize[dims]; GRID_ASSERT(cLs*blockSize[dims]==Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -201,14 +201,14 @@ int main (int argc, char ** argv) {
 
   std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
   std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
-  assert(Nm2 >= Nm1);
+  GRID_ASSERT(Nm2 >= Nm1);
 
   const int nbasis= 60;
-  assert(nbasis==Ns1);
+  GRID_ASSERT(nbasis==Ns1);
   LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
 
-  assert( (Params.doFine)||(Params.doFineRead));
+  GRID_ASSERT( (Params.doFine)||(Params.doFineRead));
 
   if ( Params.doFine ) { 
     std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
diff --git a/tests/lanczos/Test_compressed_lanczos_gparity.cc b/tests/lanczos/Test_compressed_lanczos_gparity.cc
index d5e09c0b..557fe060 100644
--- a/tests/lanczos/Test_compressed_lanczos_gparity.cc
+++ b/tests/lanczos/Test_compressed_lanczos_gparity.cc
@@ -100,7 +100,7 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
-    assert(this->subspace.size()==nbasis);
+    GRID_ASSERT(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
     WR.open(evecs_file);
@@ -122,7 +122,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_fine);
 
-    if(this->evals_fine.size() < nbasis) assert(0 && "Not enough fine evals to complete basis");
+    if(this->evals_fine.size() < nbasis) GRID_ASSERT(0 && "Not enough fine evals to complete basis");
     if(this->evals_fine.size() > nbasis){ //allow the use of precomputed evecs with a larger #evecs
       std::cout << GridLogMessage << "Truncating " << this->evals_fine.size() << " evals to basis size " << nbasis << std::endl;
       this->evals_fine.resize(nbasis);
@@ -164,7 +164,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_coarse);
 
-    assert(this->evals_coarse.size()==nvec);
+    GRID_ASSERT(this->evals_coarse.size()==nvec);
     emptyUserRecord record;
     std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
     Grid::ScidacReader RD ;
@@ -252,7 +252,7 @@ void runTest(const Options &opt){
   GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(opt.Ls,UGrid);
 
   //Setup G-parity BCs
-  assert(Nd == 4);
+  GRID_ASSERT(Nd == 4);
   std::vector<int> dirs4(4);
   for(int i=0;i<3;i++) dirs4[i] = opt.GparityDirs[i];
   dirs4[3] = 0; //periodic gauge BC in time
@@ -273,14 +273,14 @@ void runTest(const Options &opt){
   auto fineLatt     = GridDefaultLatt();
   Coordinate coarseLatt(4);
   for (int d=0;d<4;d++){
-    coarseLatt[d] = fineLatt[d]/opt.blockSize[d];    assert(coarseLatt[d]*opt.blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/opt.blockSize[d];    GRID_ASSERT(coarseLatt[d]*opt.blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<4;i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = opt.Ls/opt.blockSize[4]; assert(cLs*opt.blockSize[4]==opt.Ls);
+  int cLs = opt.Ls/opt.blockSize[4]; GRID_ASSERT(cLs*opt.blockSize[4]==opt.Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -304,9 +304,9 @@ void runTest(const Options &opt){
 
   std::cout << GridLogMessage << "Keep " << fine.N_true_get   << " fine   vectors" << std::endl;
   std::cout << GridLogMessage << "Keep " << coarse.N_true_get << " coarse vectors" << std::endl;
-  assert(coarse.N_true_get >= fine.N_true_get);
+  GRID_ASSERT(coarse.N_true_get >= fine.N_true_get);
 
-  assert(nbasis<=fine.N_true_get);
+  GRID_ASSERT(nbasis<=fine.N_true_get);
   LocalCoherenceLanczosScidac<SiteSpinor,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,SchurOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
  
@@ -411,7 +411,7 @@ int main (int argc, char ** argv)
   }
   opt.config = argv[1];
   GridCmdOptionIntVector(argv[2], opt.GparityDirs);
-  assert(opt.GparityDirs.size() == 3);
+  GRID_ASSERT(opt.GparityDirs.size() == 3);
 
   for(int i=3;i<argc;i++){
     std::string sarg = argv[i];
@@ -423,7 +423,7 @@ int main (int argc, char ** argv)
       std::cout << GridLogMessage << "Set quark mass to " << opt.mass << std::endl;
     }else if(sarg == "--block"){
       GridCmdOptionIntVector(argv[i+1], opt.blockSize);
-      assert(opt.blockSize.size() == 5);
+      GRID_ASSERT(opt.blockSize.size() == 5);
       std::cout << GridLogMessage << "Set block size to ";
       for(int q=0;q<5;q++) std::cout << opt.blockSize[q] << " ";
       std::cout << std::endl;      
@@ -480,7 +480,7 @@ int main (int argc, char ** argv)
     runTest<350>(opt); break;
   default:
     std::cout << GridLogMessage << "Unsupported basis size " << basis_size << std::endl;
-    assert(0);
+    GRID_ASSERT(0);
   }
   
   Grid_finalize();
diff --git a/tests/lanczos/Test_dwf_G5R5.cc b/tests/lanczos/Test_dwf_G5R5.cc
index f6b50531..7b4ba79f 100644
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@@ -392,7 +392,7 @@ int main(int argc, char** argv) {
     }
   }
 
-  FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
+  FILE *fp = fopen("lego-plot.py","w"); GRID_ASSERT(fp!=NULL);
 #define PYTHON_LINE(A)  fprintf(fp,A"\n");
   PYTHON_LINE("import matplotlib.pyplot as plt");
   PYTHON_LINE("import numpy as np");
diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc b/tests/lanczos/Test_dwf_block_lanczos.cc
index 671f2fa6..64931417 100644
--- a/tests/lanczos/Test_dwf_block_lanczos.cc
+++ b/tests/lanczos/Test_dwf_block_lanczos.cc
@@ -95,13 +95,13 @@ void CmdJobParams::Parse(char **argv,int argc)
   if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
     arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
     pfile.open(arg);
-    assert(pfile);
+    GRID_ASSERT(pfile);
     expect = 0;
     while( pfile >> vstr ) {
       if ( vstr.compare("boundary_phase") == 0 ) {
         pfile >> vstr;
         GridCmdOptionInt(vstr,idx);
-        assert(expect==idx);
+        GRID_ASSERT(expect==idx);
         pfile >> vstr;
         GridCmdOptionFloat(vstr,re);
         pfile >> vstr;
@@ -118,13 +118,13 @@ void CmdJobParams::Parse(char **argv,int argc)
   if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
     arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
     pfile.open(arg);
-    assert(pfile);
+    GRID_ASSERT(pfile);
     Ls = 0;
     while( pfile >> vstr ) {
       if ( vstr.compare("omega") == 0 ) {
         pfile >> vstr;
         GridCmdOptionInt(vstr,idx);
-        assert(Ls==idx);
+        GRID_ASSERT(Ls==idx);
         pfile >> vstr;
         GridCmdOptionFloat(vstr,re);
         pfile >> vstr;
@@ -324,7 +324,7 @@ int main (int argc, char ** argv)
     std::cout << GridLogMessage  << "mpi_layout= " << mpi_layout << std::endl;
     std::cout << GridLogMessage  << "mpi_split= " << mpi_split << std::endl;
     std::cout << GridLogMessage  << "mrhs= " << mrhs << std::endl;
-//    assert(JP.Nu==tmp);
+//    GRID_ASSERT(JP.Nu==tmp);
 
   /////////////////////////////////////////////
   // Split into 1^4 mpi communicators, keeping it explicitly single
diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
index 7a84a465..973e9159 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -57,7 +57,7 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
-    assert(this->subspace.size()==nbasis);
+    GRID_ASSERT(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
     WR.open(evecs_file);
@@ -79,7 +79,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_fine);
     
-    assert(this->evals_fine.size()==nbasis);
+    GRID_ASSERT(this->evals_fine.size()==nbasis);
     
     std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
     emptyUserRecord record;
@@ -116,7 +116,7 @@ public:
     XmlReader RDx(evals_file);
     read(RDx,"evals",this->evals_coarse);
 
-    assert(this->evals_coarse.size()==nvec);
+    GRID_ASSERT(this->evals_coarse.size()==nvec);
     emptyUserRecord record;
     std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
     Grid::ScidacReader RD ;
@@ -162,19 +162,19 @@ int main (int argc, char ** argv) {
 
   Coordinate fineLatt     = GridDefaultLatt();
   int dims=fineLatt.size();
-  assert(blockSize.size()==dims+1);
+  GRID_ASSERT(blockSize.size()==dims+1);
   Coordinate coarseLatt(dims);
   Coordinate coarseLatt5d ;
 
   for (int d=0;d<coarseLatt.size();d++){
-    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    GRID_ASSERT(coarseLatt[d]*blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<coarseLatt.size();i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  int cLs = Ls/blockSize[dims]; GRID_ASSERT(cLs*blockSize[dims]==Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -201,14 +201,14 @@ int main (int argc, char ** argv) {
 
   std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
   std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
-  assert(Nm2 >= Nm1);
+  GRID_ASSERT(Nm2 >= Nm1);
 
   const int nbasis= 60;
-  assert(nbasis==Ns1);
+  GRID_ASSERT(nbasis==Ns1);
   LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
 
-  assert( (Params.doFine)||(Params.doFineRead));
+  GRID_ASSERT( (Params.doFine)||(Params.doFineRead));
 
   if ( Params.doFine ) { 
     std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
index e82a9741..682b3a8a 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc
@@ -159,7 +159,7 @@ public:
   void calcFine(RealD alpha, RealD beta,int Npoly,int Nm,RealD resid, 
 		RealD MaxIt, RealD betastp, int MinRes)
   {
-    assert(nbasis<=Nm);
+    GRID_ASSERT(nbasis<=Nm);
     Chebyshev<FineField>      Cheby(alpha,beta,Npoly);
     FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
     PlainHermOp<FineField>    Op(_FineOp);
@@ -269,19 +269,19 @@ int main (int argc, char ** argv) {
 
   Coordinate fineLatt     = GridDefaultLatt();
   int dims=fineLatt.size();
-  assert(blockSize.size()==dims+1);
+  GRID_ASSERT(blockSize.size()==dims+1);
   Coordinate coarseLatt(dims);
   Coordinate coarseLatt5d ;
 
   for (int d=0;d<coarseLatt.size();d++){
-    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    GRID_ASSERT(coarseLatt[d]*blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<coarseLatt.size();i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  int cLs = Ls/blockSize[dims]; GRID_ASSERT(cLs*blockSize[dims]==Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -312,7 +312,7 @@ int main (int argc, char ** argv) {
 
   std::cout << GridLogMessage << "Keep " << fine.Nk   << " full vectors" << std::endl;
   std::cout << GridLogMessage << "Keep " << coarse.Nk << " total vectors" << std::endl;
-  assert(Nm2 >= Nm1);
+  GRID_ASSERT(Nm2 >= Nm1);
 
   const int nbasis= 32;
   CoarseFineIRL<vSpinColourVector,vTComplex,nbasis> IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd);
diff --git a/tests/lanczos/Test_evec_compression.cc b/tests/lanczos/Test_evec_compression.cc
index 5ba1597c..b3b937d7 100644
--- a/tests/lanczos/Test_evec_compression.cc
+++ b/tests/lanczos/Test_evec_compression.cc
@@ -96,7 +96,7 @@ public:
 		GridBase *FineGrid,
 		GridBase *CoarseGrid){
     int nevecs = evecs_in.size();
-    assert(nevecs > nbasis);
+    GRID_ASSERT(nevecs > nbasis);
     
     //Construct the basis
     basis.resize(nbasis, FineGrid);
@@ -273,7 +273,7 @@ struct Args{
 
 GparityWilsonImplD::ImplParams setupGparityParams(const std::vector<int> &GparityDirs){
   //Setup G-parity BCs
-  assert(Nd == 4);
+  GRID_ASSERT(Nd == 4);
   std::vector<int> dirs4(4);
   for(int i=0;i<3;i++) dirs4[i] = GparityDirs[i];
   dirs4[3] = 0; //periodic gauge BC in time
@@ -309,14 +309,14 @@ void run_b(ActionType &action, const std::string &config, const Args &args){
   auto fineLatt     = GridDefaultLatt();
   Coordinate coarseLatt(4);
   for (int d=0;d<4;d++){
-    coarseLatt[d] = fineLatt[d]/args.blockSize[d];    assert(coarseLatt[d]*args.blockSize[d]==fineLatt[d]);
+    coarseLatt[d] = fineLatt[d]/args.blockSize[d];    GRID_ASSERT(coarseLatt[d]*args.blockSize[d]==fineLatt[d]);
   }
 
   std::cout << GridLogMessage<< " 5d coarse lattice is ";
   for (int i=0;i<4;i++){
     std::cout << coarseLatt[i]<<"x";
   } 
-  int cLs = args.Ls/args.blockSize[4]; assert(cLs*args.blockSize[4]==args.Ls);
+  int cLs = args.Ls/args.blockSize[4]; GRID_ASSERT(cLs*args.blockSize[4]==args.Ls);
   std::cout << cLs<<std::endl;
   
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
@@ -350,7 +350,7 @@ void run_b(ActionType &action, const std::string &config, const Args &args){
     XmlReader RDx(evals_file);
     read(RDx,"evals",evals);
     
-    assert(evals.size()==fine.N_true_get);
+    GRID_ASSERT(evals.size()==fine.N_true_get);
     
     std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
     emptyUserRecord record;
@@ -369,7 +369,7 @@ void run_b(ActionType &action, const std::string &config, const Args &args){
     RealD resid = fine.stop_rsd;
     int MaxIt = fine.maxits;
     
-    assert(nbasis<=Nm);    
+    GRID_ASSERT(nbasis<=Nm);    
     Chebyshev<FermionField>      Cheby(fine.getChebyParams());
     FunctionHermOp<FermionField> ChebyOp(Cheby,SchurOp);
     PlainHermOp<FermionField>    Op(SchurOp);
@@ -386,7 +386,7 @@ void run_b(ActionType &action, const std::string &config, const Args &args){
 
     int Nconv;
     IRL.calc(evals, evecs,src,Nconv,false);
-    if(Nconv < Nstop) assert(0 && "Fine lanczos failed to converge the required number of evecs"); //algorithm doesn't consider this a failure
+    if(Nconv < Nstop) GRID_ASSERT(0 && "Fine lanczos failed to converge the required number of evecs"); //algorithm doesn't consider this a failure
     if(Nconv > Nstop){
       //Yes this potentially throws away some evecs but it is better than having a random number of evecs between Nstop and Nm!
       evals.resize(Nstop);
@@ -430,7 +430,7 @@ void run_b(ActionType &action, const std::string &config, const Args &args){
   Chebyshev<FermionField> smoother(fine.getChebyParams());
   
   //Test the quality of the uncompressed evecs
-  assert( compressor.testCompression(SchurOp, smoother, basis, compressed_evecs, evals, fine.stop_rsd, args.coarse_relax_tol) );   
+  GRID_ASSERT( compressor.testCompression(SchurOp, smoother, basis, compressed_evecs, evals, fine.stop_rsd, args.coarse_relax_tol) );   
 }
 
 template<typename ActionType>
@@ -453,7 +453,7 @@ void run(ActionType &action, const std::string &config, const Args &args){
   case 400:
     return run_b<400>(action,config,args);
   default:
-    assert(0 && "Unsupported basis size: allowed values are 50,100,200,250,300,350,400");
+    GRID_ASSERT(0 && "Unsupported basis size: allowed values are 50,100,200,250,300,350,400");
   }
 }
 
@@ -489,7 +489,7 @@ int main (int argc, char ** argv) {
 
   Args args;
   GridCmdOptionIntVector(argv[2], args.GparityDirs);
-  assert(args.GparityDirs.size() == 3);
+  GRID_ASSERT(args.GparityDirs.size() == 3);
 
   std::string action_s = "Mobius"; 
   
@@ -503,7 +503,7 @@ int main (int argc, char ** argv) {
       std::cout << GridLogMessage << "Set quark mass to " << args.mass << std::endl;
     }else if(sarg == "--block"){
       GridCmdOptionIntVector(argv[i+1], args.blockSize);
-      assert(args.blockSize.size() == 5);
+      GRID_ASSERT(args.blockSize.size() == 5);
       std::cout << GridLogMessage << "Set block size to ";
       for(int q=0;q<5;q++) std::cout << args.blockSize[q] << " ";
       std::cout << std::endl;      
@@ -567,7 +567,7 @@ int main (int argc, char ** argv) {
       run(action, config, args);	    
     }
 #else
-    assert(0);
+    GRID_ASSERT(0);
 #endif
   }else{
     WilsonImplD::ImplParams Params = setupParams();
diff --git a/tests/lanczos/Test_wilson_DWFKernel.cc b/tests/lanczos/Test_wilson_DWFKernel.cc
index ab60d780..edc7fe1d 100644
--- a/tests/lanczos/Test_wilson_DWFKernel.cc
+++ b/tests/lanczos/Test_wilson_DWFKernel.cc
@@ -71,23 +71,23 @@ public:
 
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
     _Mat.Mdiag(in,out);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
     _Mat.Mdir(in,out,dir,disp);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
     _Mat.MdirAll(in,out);
   };
   void Op     (const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
     _Mat.M(in,out);
   }
   void AdjOp     (const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
     _Mat.Mdag(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
diff --git a/tests/lanczos/Test_wilson_bilanczos.cc b/tests/lanczos/Test_wilson_bilanczos.cc
new file mode 100644
index 00000000..bda7a842
--- /dev/null
+++ b/tests/lanczos/Test_wilson_bilanczos.cc
@@ -0,0 +1,371 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/lanczos/Test_wilson_bilanczos.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+// copied here from Test_general_coarse_pvdagm.cc
+
+#include <cstdlib>
+
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
+#include <Grid/algorithms/iterative/BiCGSTAB.h>
+
+using namespace std;
+using namespace Grid;
+
+namespace Grid {
+// Run parameters for this Lanczos/Krylov-Schur test; the members are declared by the serialisation macro below.
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+                                RealD, mass,
+                                RealD, mstep,
+                                Integer, Nstop,
+                                Integer, Nk,
+                                Integer, Np,
+                                Integer, ReadEvec,
+                                Integer, maxIter,
+                                RealD, resid,
+                                RealD, ChebyLow,
+                                RealD, ChebyHigh,
+                                Integer, ChebyOrder)
+
+  LanczosParameters() {
+    // Every member gets a defined value (previously only mass was set); sizes mirror the fallbacks in main().
+    mass = 0; mstep = 0; resid = 1.0e-5; ChebyLow = 0; ChebyHigh = 0; ChebyOrder = 0;
+    Nstop = 10; Nk = 12; Np = 38; ReadEvec = 0; maxIter = 10000;
+  }
+
+  template <class ReaderClass >
+  LanczosParameters(Reader<ReaderClass> & TheReader){
+    initialize(TheReader);
+  }
+
+  template < class ReaderClass >
+  void initialize(Reader<ReaderClass> &TheReader){
+    // NOTE(review): node name "HMC" differs from the "LanczosParameters" tag that main() reads and writes - confirm.
+    read(TheReader, "HMC", *this);
+  }
+
+  // Currently a no-op: every statement in the body is commented out and refers to HMC fields this struct lacks.
+  void print_parameters() const {
+//    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
+//    MD.print_parameters();
+  }
+
+};
+
+}
+
+template <class T>
+void writeFile(T& in, std::string const fname){
+  // Dumps `in` to `fname` as a single SciDAC field record carrying an empty user record.
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord noUserData;
+  const bool onBossRank = in.Grid()->IsBoss();
+  Grid::ScidacWriter scidacOut(onBossRank);
+  scidacOut.open(fname);
+  scidacOut.writeScidacFieldRecord(in,noUserData,0);
+  scidacOut.close(); // What is the appropriate way to throw error? (nothing is checked here)
+}
+
+
+typedef WilsonFermionD WilsonOp;
+typedef typename WilsonFermionD::FermionField FermionField;
+// Linear operator whose Op() hands Mdag*in to CG on MdagM, i.e. it is meant to apply the inverse of _Mat.
+template<class Matrix,class Field>
+class InvertNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;   // wrapped matrix, held by reference: it must outlive this operator
+  RealD _stp;     // first argument given to ConjugateGradient in Op() (presumably the stopping tolerance)
+public:
+  InvertNonHermitianLinearOperator(Matrix &Mat,RealD stp=1e-8): _Mat(Mat),_stp(stp){};
+  // Coarsening hooks (OpDiag/OpDir/OpDirAll) and the Hermitian entry points are not implemented here.
+  void OpDiag (const Field &in, Field &out) {
+    // Calling an unimplemented hook is a usage error. GRID_ASSERT is the assertion macro
+    // used by the rest of the code base, so the stubs use it instead of plain assert.
+    GRID_ASSERT(0);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    // Not implemented for this operator.
+    GRID_ASSERT(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    // Not implemented for this operator.
+    GRID_ASSERT(0);
+  };
+  void Op     (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    _Mat.Mdag(in,tmp);                          // tmp = Mdag in, the source passed to CG below
+    MdagMLinearOperator<Matrix,Field> HermOp(_Mat);
+    ConjugateGradient<Field> CG(_stp,10000);    // 10000 is presumably the iteration cap - TODO confirm
+    CG(HermOp,tmp,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    // NOTE(review): this applies Mdag directly, whereas Op() goes through a CG solve - confirm this is intended.
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    GRID_ASSERT(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    GRID_ASSERT(0);
+  }
+};
+
+template<class Field>
+void testSchurFromHess(Arnoldi<Field>& Arn, Field& src, int Nlarge, int Nm, int Nk) {
+  // Exercises ComplexSchurDecomposition on the Hessenberg matrix from one Arnoldi call, logging every step.
+  const char *const rule = "*******************************************";
+  std::cout << GridLogMessage << rule << std::endl;
+  std::cout << GridLogMessage << "Testing Schur reordering, Nm = " << Nm << ", Nk = " << Nk << std::endl;
+  std::cout << GridLogMessage << rule << std::endl;
+  std::cout << GridLogMessage << "Running Arnoldi for 1 iteration to get a Hessenberg." << std::endl;
+  Arn(src, 1, Nlarge, Nm, Nlarge);
+  Eigen::MatrixXcd hessenberg = Arn.getHessenbergMat();
+  std::cout << GridLogMessage << "Hessenberg for use: " << std::endl << hessenberg << std::endl;
+
+  ComplexSchurDecomposition decomp (hessenberg, true);
+  const bool holdsInitially = decomp.checkDecomposition();
+  std::cout << "Schur decomp holds? " << holdsInitially << std::endl;
+
+  std::cout << GridLogMessage << "S = " << std::endl << decomp.getMatrixS() << std::endl;
+  std::cout << GridLogMessage << "Swapping S(3, 3) with S(4, 4)" << std::endl;
+  decomp.swapEvals(3);
+  std::cout << GridLogMessage << "S after swap = " << std::endl << decomp.getMatrixS() << std::endl;
+  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
+
+  // Carry the last diagonal entry to the front with a chain of swaps.
+  std::cout << GridLogMessage << "Moving last eval to front. S at start = " << std::endl << decomp.getMatrixS() << std::endl;
+  // Swap indices run Nk-2, Nk-3, ..., 0; the decomposition is re-checked after each one.
+  for (int swapIdx = Nk - 2; swapIdx >= 0; --swapIdx) {
+    decomp.swapEvals(swapIdx);
+    std::cout << GridLogMessage << "S after swap of index " << swapIdx << " = " << std::endl << decomp.getMatrixS() << std::endl;
+    std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
+  }
+
+  std::cout << GridLogMessage << "Testing Schur reorder" << std::endl;
+  decomp.schurReorder(Nk);
+  std::cout << GridLogMessage << "S after reorder = " << std::endl << decomp.getMatrixS() << std::endl;
+  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
+
+}
+// Driver: reads "config" and LanParams.xml, runs Krylov-Schur and Lanczos bidiagonalisation on the Wilson operator, writes evec_* files.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=16;
+
+//   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+//  std::vector<int> lat_size {32, 32, 32, 32};
+//  std::cout << "Lattice size: " << lat_size << std::endl;
+  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                         GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                         GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+//  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+//  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  GridCartesian         * FGrid   = UGrid;
+  GridRedBlackCartesian * FrbGrid = UrbGrid;
+
+  // Construct a coarsened grid
+  // poare TODO: replace this with the following line?
+  Coordinate clatt = GridDefaultLatt();
+//   Coordinate clatt = GridDefaultLatt();              // [PO] initial line before I edited it
+  for(int d=0;d<clatt.size();d++){
+    std::cout << GridLogMessage<< clatt[d] <<std::endl;
+    clatt[d] = clatt[d]/2;
+    //    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  LatticeFermion result(FGrid); result=Zero();
+  LatticeFermion    ref(FGrid); ref=Zero();
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("config");
+//  std::string file("Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  LanczosParameters LanParams;
+  {
+    XmlReader  HMCrd("LanParams.xml");
+    read(HMCrd,"LanczosParameters",LanParams);
+  }
+
+  std::cout << GridLogMessage<< LanParams <<std::endl;
+  {
+    XmlWriter HMCwr("LanParams.xml.out");
+    write(HMCwr,"LanczosParameters",LanParams);
+  }
+
+  // Hard-coded fallbacks: mass, resid, Nstop, Nk, Np, maxIter and Nm are overwritten from LanParams further down.
+  RealD mass=0.01;
+  RealD M5=1.8;
+
+  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
+  int Nm = 50;
+  int Nk = 12;
+  int Np = 38;
+  // int Nk = Nm+1;     // if just running once
+  int maxIter = 10000;
+  int Nstop = 10;
+  RealD resid = 1.0e-5;
+
+  std::vector<Complex> boundary = {1,1,1,-1};
+  WilsonOp::ImplParams Params(boundary);
+
+//  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+//  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
+
+  mass=LanParams.mass;
+  std::cout << GridLogIRL<< "mass "<<mass<<std::endl;
+  WilsonOp WilsonOperator(Umu,*UGrid,*UrbGrid,mass,Params);
+
+  // const int nbasis = 20;            // size of approximate basis for low-mode space
+  const int nbasis = 3;            // size of approximate basis for low-mode space
+  const int cb = 0 ;
+  LatticeFermion prom(FGrid);
+
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNearestStencilGeometry5D geom(Coarse5d);
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+
+//  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
+//  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
+//  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
+//  PVdagM_t PVdagM(Ddwf, Dpv);
+//  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
+//  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
+//  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
+
+  // Linear-operator wrappers around the Wilson operator; only Dwilson is passed to the solvers below.
+  NonHermitianLinearOperator<WilsonOp,FermionField> Dwilson(WilsonOperator); /// <-----
+//  InvertNonHermitianLinearOperator<WilsonOp,FermionField> Iwilson(WilsonOperator); /// <-----
+  MdagMLinearOperator<WilsonOp,FermionField> HermOp(WilsonOperator); /// <-----
+  Gamma5HermitianLinearOperator <WilsonOp,LatticeFermion> HermOp2(WilsonOperator); /// <----
+
+  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
+  resid=LanParams.resid;
+  Nstop=LanParams.Nstop;
+  Nk=LanParams.Nk;
+  Np=LanParams.Np;
+  maxIter=LanParams.maxIter;
+  Nm = Nk + Np;
+  int Nu=16;
+  std::vector<LatticeFermion> src(Nu,FGrid);
+  for(int i=0;i<Nu;i++) random(RNG5,src[i]);
+
+  if(LanParams.ReadEvec) {
+    std::string evecs_file="evec_in";
+    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD;
+    RD.open(evecs_file);
+    RD.readScidacFieldRecord(src[0],record);
+    RD.close();
+  }
+
+  Coordinate origin ({0,0,0,0});
+  auto tmpSrc = peekSite(src[0], origin);
+  std::cout << "[DEBUG] Source at origin = " <<  tmpSrc << std::endl;
+  LatticeFermion src2 = src[0];
+
+  // Run Krylov-Schur on the non-Hermitian Wilson operator wrapper (Dwilson)
+  std::cout << GridLogMessage << "Running Krylov Schur" << std::endl;
+#if 1
+    RealD shift=1.5;
+    KrylovSchur KrySchur (Dwilson, UGrid, resid,EvalImNormSmall);
+    KrySchur(src[0], maxIter, Nm, Nk, Nstop,&shift);
+#else
+    KrylovSchur KrySchur (Iwilson, UGrid, resid,EvalImNormSmall);
+    KrySchur(src[0], maxIter, Nm, Nk, Nstop);
+#endif
+//  std::cout << GridLogMessage << "evec.size= " << KrySchur.evecs.size()<< std::endl;
+  LanczosBidiagonalization<FermionField> LB(Dwilson, UGrid); // FermionField: no type named Field is declared at this scope
+  LB.run(src[0], Nm, resid); // resid replaces the undeclared tol - NOTE(review): confirm this is the intended tolerance
+
+  src[0]=KrySchur.evecs[0];
+  for (int i=1;i<Nstop;i++) src[0]+=KrySchur.evecs[i];
+  for (int i=0;i<Nstop;i++)
+  {
+        std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
+        auto evdensity = localInnerProduct(KrySchur.evecs[i],KrySchur.evecs[i] );
+        writeFile(evdensity,evfile);
+
+  }
+
+  {
+        std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
+//        auto evdensity = localInnerProduct(evec[i],evec[i] );
+        writeFile(src[0],evfile);
+  }
+
+
+  /*
+  std::cout << GridLogMessage << "Running Arnoldi" << std::endl;
+  // Arnoldi Arn (Dsq, FGrid, 1e-8);
+  Arnoldi Arn (DLinOp, FGrid, 1e-8);
+  testSchurFromHess<LatticeFermion>(Arn, src, 10, 6, 4);
+
+  Arnoldi Arn2 (DLinOp, FGrid, 1e-8);
+  testSchurFromHess<LatticeFermion>(Arn2, src, 16, 12, 8);
+  */
+
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
+  std::cout<<GridLogMessage<<std::endl;
+  std::cout<<GridLogMessage << "Done "<< std::endl;
+
+  Grid_finalize();
+  return 0;
+}
diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc
index c6ce2800..734aa1db 100644
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -387,7 +387,7 @@ public:
      Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
      return  M;
    }
-   assert(0);
+   GRID_ASSERT(0);
   }
 
   static Chroma::Handle< Chroma::SystemSolver<QDP::LatticeFermion> > GetSolver(QDP::multi1d<QDP::LatticeColorMatrix> &u, ChromaAction parms)
@@ -745,7 +745,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF
     return;
   }
   
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 
diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc
index 8ce28dca..e49b95d6 100644
--- a/tests/qdpxx/Test_qdpxx_wilson.cc
+++ b/tests/qdpxx/Test_qdpxx_wilson.cc
@@ -440,7 +440,7 @@ void calc_grid(ChromaAction action, Grid::LatticeGaugeField &Umu, Grid::LatticeF
     return;
   }
 
-  assert(0);
+  GRID_ASSERT(0);
 }
 
 int main(int argc, char **argv)
diff --git a/tests/smearing/Test_WilsonFlow_adaptive.cc b/tests/smearing/Test_WilsonFlow_adaptive.cc
index 23123eb9..48b64734 100644
--- a/tests/smearing/Test_WilsonFlow_adaptive.cc
+++ b/tests/smearing/Test_WilsonFlow_adaptive.cc
@@ -47,7 +47,7 @@ RealD interpolate(const RealD t_int, const std::vector<std::pair<RealD,RealD> >
     }
     else if(diff < tdiff2){ tdiff2 = diff; t2_idx = i; }
   }
-  assert(t1_idx != -1 && t2_idx != -1);
+  GRID_ASSERT(t1_idx != -1 && t2_idx != -1);
   
   RealD t2 = data[t2_idx].first,  v2 = data[t2_idx].second;
   RealD t1 = data[t1_idx].first,  v1 = data[t1_idx].second;
diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc
index 60e5c372..f333e2c1 100644
--- a/tests/solver/Test_coarse_even_odd.cc
+++ b/tests/solver/Test_coarse_even_odd.cc
@@ -169,7 +169,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: "
               << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check "
               << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -200,7 +200,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: "
               << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -222,7 +222,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: "
               << real(dot) << " " << imag(dot) << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -242,7 +242,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: "
               << real(dot) << " " << imag(dot) << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -262,7 +262,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: "
               << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -343,7 +343,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. deviation: "
               << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -380,7 +380,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: "
               << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
@@ -429,7 +429,7 @@ int main(int argc, char** argv) {
     std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. deviation: "
               << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev
               << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl;
-    assert(relDev <= checkTolerance);
+    GRID_ASSERT(relDev <= checkTolerance);
   }
 
   {
diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc
index efbd7fc6..7ebae9d8 100644
--- a/tests/solver/Test_dwf_cg_prec.cc
+++ b/tests/solver/Test_dwf_cg_prec.cc
@@ -95,7 +95,7 @@ int main(int argc, char** argv) {
   GridStopWatch CGTimer;
 
   SchurDiagMooeeOperator<DomainWallFermionD, LatticeFermion> HermOpEO(Ddwf);
-  ConjugateGradient<LatticeFermion> CG(1.0e-5, 10000, 0);// switch off the assert
+  ConjugateGradient<LatticeFermion> CG(1.0e-5, 10000, 0);// switch off the GRID_ASSERT
 
   CGTimer.Start();
   CG(HermOpEO, src_o, result_o);
diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc
index 31b58284..1b8bc3c9 100644
--- a/tests/solver/Test_dwf_hdcr.cc
+++ b/tests/solver/Test_dwf_hdcr.cc
@@ -267,7 +267,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0);
diff --git a/tests/solver/Test_dwf_hdcr_16_rb.cc b/tests/solver/Test_dwf_hdcr_16_rb.cc
index ae8e7ae5..ee613aad 100644
--- a/tests/solver/Test_dwf_hdcr_16_rb.cc
+++ b/tests/solver/Test_dwf_hdcr_16_rb.cc
@@ -281,7 +281,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     LatticeFermion A(FGrid);
diff --git a/tests/solver/Test_dwf_hdcr_24_regression.cc b/tests/solver/Test_dwf_hdcr_24_regression.cc
index 88ae7fd2..023a9e8b 100644
--- a/tests/solver/Test_dwf_hdcr_24_regression.cc
+++ b/tests/solver/Test_dwf_hdcr_24_regression.cc
@@ -263,7 +263,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     LatticeFermion A(FGrid);
diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc
index 47e129f3..0975fa40 100644
--- a/tests/solver/Test_dwf_hdcr_2level.cc
+++ b/tests/solver/Test_dwf_hdcr_2level.cc
@@ -301,7 +301,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     //    Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,100,0.0);// 18s
diff --git a/tests/solver/Test_dwf_hdcr_48_rb.cc b/tests/solver/Test_dwf_hdcr_48_rb.cc
index 25ac1dac..3fef75ca 100644
--- a/tests/solver/Test_dwf_hdcr_48_rb.cc
+++ b/tests/solver/Test_dwf_hdcr_48_rb.cc
@@ -277,7 +277,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     LatticeFermion A(FGrid);
diff --git a/tests/solver/Test_dwf_hdcr_48_regression.cc b/tests/solver/Test_dwf_hdcr_48_regression.cc
index 8c56c8f9..616ab7ca 100644
--- a/tests/solver/Test_dwf_hdcr_48_regression.cc
+++ b/tests/solver/Test_dwf_hdcr_48_regression.cc
@@ -261,7 +261,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     LatticeFermion A(FGrid);
diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc
index 1cd83375..0f933971 100644
--- a/tests/solver/Test_dwf_multigrid.cc
+++ b/tests/solver/Test_dwf_multigrid.cc
@@ -85,13 +85,13 @@ public:
   PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
 
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     Field tmp(in.Grid());
@@ -104,10 +104,10 @@ public:
     _Mat.Mdag(in,tmp);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 
@@ -411,7 +411,7 @@ int main (int argc, char ** argv)
 
   Subspace Aggregates(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   {
     int nb=nbasis/2;
     Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0);
diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc
index 63ffe1c6..ba46b597 100644
--- a/tests/solver/Test_dwf_multishift_mixedprec.cc
+++ b/tests/solver/Test_dwf_multishift_mixedprec.cc
@@ -69,7 +69,7 @@ void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams &param
   bool cfg_loaded=false;
   for(int i=1;i<argc;i++){
     if(std::string(argv[i]) == "--load_config"){
-      assert(i != argc-1);
+      GRID_ASSERT(i != argc-1);
       std::string file = argv[i+1];
       NerscIO io;
       FieldMetaData metadata;
@@ -158,9 +158,9 @@ int main (int argc, char ** argv)
   for(int i=1;i<argc;i++){
     std::string arg(argv[i]);
     if(arg == "--Gparity"){
-      assert(i!=argc-1);
+      GRID_ASSERT(i!=argc-1);
       gpdir = std::stoi(argv[i+1]);
-      assert(gpdir >= 0 && gpdir <= 2); //spatial!
+      GRID_ASSERT(gpdir >= 0 && gpdir <= 2); //spatial!
       gparity = true;
     }
   }
diff --git a/tests/solver/Test_eofa_inv.cc b/tests/solver/Test_eofa_inv.cc
index 71952b97..de4ee49e 100644
--- a/tests/solver/Test_eofa_inv.cc
+++ b/tests/solver/Test_eofa_inv.cc
@@ -106,7 +106,7 @@ int main (int argc, char** argv)
   LatticeFermion diff = MinvMeta - eta;
 
   std::cout << GridLogMessage << "eta: " << norm2(eta) << " M*eta: " << norm2(Meta) << " M^{-1}*M*eta: " << norm2(MinvMeta) << "  M^{-1}*M*eta - eta: " << norm2(diff) << " (expect 0)" << std::endl;
-  assert(norm2(diff) < 1e-8);
+  GRID_ASSERT(norm2(diff) < 1e-8);
 
   //Check right inverse
   LatticeFermion MinvEta(FGrid);
@@ -118,7 +118,7 @@ int main (int argc, char** argv)
   diff = MMinvEta - eta;
   
   std::cout << GridLogMessage << "eta: " << norm2(eta) << " M^{-1}*eta: " << norm2(MinvEta) << " M*M^{-1}*eta: " << norm2(MMinvEta) << "  M*M^{-1}*eta - eta: " << norm2(diff) << " (expect 0)" << std::endl;
-  assert(norm2(diff) < 1e-8);
+  GRID_ASSERT(norm2(diff) < 1e-8);
 
   std::cout << GridLogMessage << "Done" << std::endl;
   Grid_finalize();
diff --git a/tests/solver/Test_hw_multigrid.cc b/tests/solver/Test_hw_multigrid.cc
index fd30bca7..6714d7f2 100644
--- a/tests/solver/Test_hw_multigrid.cc
+++ b/tests/solver/Test_hw_multigrid.cc
@@ -82,13 +82,13 @@ public:
   PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
 
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     Field tmp(in.Grid());
@@ -101,10 +101,10 @@ public:
     _Mat.Mdag(in,tmp);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 
@@ -309,7 +309,7 @@ int main (int argc, char ** argv)
   Subspace Aggregates4D(Coarse4d,UGrid,0);
   Subspace Aggregates5D(Coarse5d,FGrid,0);
 
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << " 4D subspace build                                " <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
diff --git a/tests/solver/Test_hw_multigrid_mixed_48.cc b/tests/solver/Test_hw_multigrid_mixed_48.cc
index 3a31ddbe..236cdc1d 100644
--- a/tests/solver/Test_hw_multigrid_mixed_48.cc
+++ b/tests/solver/Test_hw_multigrid_mixed_48.cc
@@ -109,7 +109,7 @@ public:
     ///////////////////////////////////////////////////////////
     // The Cayley coeffs (unprec)
     ///////////////////////////////////////////////////////////
-    assert(gamma.size()==Ls);
+    GRID_ASSERT(gamma.size()==Ls);
 
     omega.resize(Ls);
     bs.resize(Ls);
@@ -125,7 +125,7 @@ public:
     for(int i=0; i < Ls; i++){
       as[i] = 1.0;
       omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
-      assert(omega[i]!=Coeff_t(0.0));
+      GRID_ASSERT(omega[i]!=Coeff_t(0.0));
       bs[i] = 0.5*(bpc/omega[i] + bmc);
       cs[i] = 0.5*(bpc/omega[i] - bmc);
     }
@@ -140,7 +140,7 @@ public:
     
     for(int i=0;i<Ls;i++){
       bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-      assert(bee[i]!=Coeff_t(0.0));
+      GRID_ASSERT(bee[i]!=Coeff_t(0.0));
       cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
       beo[i]=as[i]*bs[i];
       ceo[i]=-as[i]*cs[i];
@@ -174,7 +174,7 @@ public:
       
 	leem[i]=mass*cee[Ls-1]/bee[0];
 	for(int j=0;j<i;j++) {
-	  assert(bee[j+1]!=Coeff_t(0.0));
+	  GRID_ASSERT(bee[j+1]!=Coeff_t(0.0));
 	  leem[i]*= aee[j]/bee[j+1];
 	}
       
@@ -316,7 +316,7 @@ public:
     autoView(psi , psi_i,AcceleratorRead);
     autoView(phi , phi_i,AcceleratorRead);
     autoView(chi , chi_i,AcceleratorWrite);
-    assert(phi.Checkerboard() == psi.Checkerboard());
+    GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
     auto pdiag = &diag[0];
     auto pupper = &upper[0];
@@ -354,7 +354,7 @@ public:
     autoView(psi , psi_i,AcceleratorRead);
     autoView(phi , phi_i,AcceleratorRead);
     autoView(chi , chi_i,AcceleratorWrite);
-    assert(phi.Checkerboard() == psi.Checkerboard());
+    GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
     
     auto pdiag = &diag[0];
     auto pupper = &upper[0];
@@ -438,7 +438,7 @@ public:
   }
   virtual  void Mdir     (const CoarseVector &in, CoarseVector &out,int dir, int disp)
   {
-    assert(0);
+    GRID_ASSERT(0);
   }
   virtual  void MdirAll  (const CoarseVector &in, std::vector<CoarseVector> &out)
   {
@@ -679,13 +679,13 @@ public:
   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     Field tmp(in.Grid());
@@ -698,10 +698,10 @@ public:
     _Mat.Mdag(in,tmp);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 
@@ -1024,7 +1024,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << " 4D subspace build                                " <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   Subspace Aggregates4D(Coarse4d,UGrid,0);
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   int nb=nbasis/2;
   Gamma g5(Gamma::Algebra::Gamma5);
 
diff --git a/tests/solver/Test_hw_multigrid_mixed_48_rb.cc b/tests/solver/Test_hw_multigrid_mixed_48_rb.cc
index 0f18893e..d8a06f80 100644
--- a/tests/solver/Test_hw_multigrid_mixed_48_rb.cc
+++ b/tests/solver/Test_hw_multigrid_mixed_48_rb.cc
@@ -109,7 +109,7 @@ public:
     ///////////////////////////////////////////////////////////
     // The Cayley coeffs (unprec)
     ///////////////////////////////////////////////////////////
-    assert(gamma.size()==Ls);
+    GRID_ASSERT(gamma.size()==Ls);
 
     omega.resize(Ls);
     bs.resize(Ls);
@@ -125,7 +125,7 @@ public:
     for(int i=0; i < Ls; i++){
       as[i] = 1.0;
       omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
-      assert(omega[i]!=Coeff_t(0.0));
+      GRID_ASSERT(omega[i]!=Coeff_t(0.0));
       bs[i] = 0.5*(bpc/omega[i] + bmc);
       cs[i] = 0.5*(bpc/omega[i] - bmc);
     }
@@ -140,7 +140,7 @@ public:
     
     for(int i=0;i<Ls;i++){
       bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-      assert(bee[i]!=Coeff_t(0.0));
+      GRID_ASSERT(bee[i]!=Coeff_t(0.0));
       cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
       beo[i]=as[i]*bs[i];
       ceo[i]=-as[i]*cs[i];
@@ -174,7 +174,7 @@ public:
       
 	leem[i]=mass*cee[Ls-1]/bee[0];
 	for(int j=0;j<i;j++) {
-	  assert(bee[j+1]!=Coeff_t(0.0));
+	  GRID_ASSERT(bee[j+1]!=Coeff_t(0.0));
 	  leem[i]*= aee[j]/bee[j+1];
 	}
       
@@ -316,7 +316,7 @@ public:
     autoView(psi , psi_i,AcceleratorRead);
     autoView(phi , phi_i,AcceleratorRead);
     autoView(chi , chi_i,AcceleratorWrite);
-    assert(phi.Checkerboard() == psi.Checkerboard());
+    GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
 
     auto pdiag = &diag[0];
     auto pupper = &upper[0];
@@ -354,7 +354,7 @@ public:
     autoView(psi , psi_i,AcceleratorRead);
     autoView(phi , phi_i,AcceleratorRead);
     autoView(chi , chi_i,AcceleratorWrite);
-    assert(phi.Checkerboard() == psi.Checkerboard());
+    GRID_ASSERT(phi.Checkerboard() == psi.Checkerboard());
     
     auto pdiag = &diag[0];
     auto pupper = &upper[0];
@@ -438,7 +438,7 @@ public:
   }
   virtual  void Mdir     (const CoarseVector &in, CoarseVector &out,int dir, int disp)
   {
-    assert(0);
+    GRID_ASSERT(0);
   }
   virtual  void MdirAll  (const CoarseVector &in, std::vector<CoarseVector> &out)
   {
@@ -699,13 +699,13 @@ public:
   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
 
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     Field tmp(in.Grid());
@@ -718,10 +718,10 @@ public:
     _Mat.Mdag(in,tmp);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
   }
 };
 
@@ -1048,7 +1048,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << " 4D subspace build                                " <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   Subspace Aggregates4D(Coarse4d,UGrid,0);
-  assert ( (nbasis & 0x1)==0);
+  GRID_ASSERT ( (nbasis & 0x1)==0);
   int nb=nbasis/2;
   Gamma g5(Gamma::Algebra::Gamma5);
 
diff --git a/tests/solver/Test_multigrid_common.h b/tests/solver/Test_multigrid_common.h
index 0cb63530..c1570a38 100644
--- a/tests/solver/Test_multigrid_common.h
+++ b/tests/solver/Test_multigrid_common.h
@@ -86,13 +86,13 @@ void checkParameterValidity(MultiGridParams const &params) {
 
   auto correctSize = params.nLevels - 1;
 
-  assert(correctSize == params.blockSizes.size());
-  assert(correctSize == params.smootherTol.size());
-  assert(correctSize == params.smootherMaxOuterIter.size());
-  assert(correctSize == params.smootherMaxInnerIter.size());
-  assert(correctSize == params.kCycleTol.size());
-  assert(correctSize == params.kCycleMaxOuterIter.size());
-  assert(correctSize == params.kCycleMaxInnerIter.size());
+  GRID_ASSERT(correctSize == params.blockSizes.size());
+  GRID_ASSERT(correctSize == params.smootherTol.size());
+  GRID_ASSERT(correctSize == params.smootherMaxOuterIter.size());
+  GRID_ASSERT(correctSize == params.smootherMaxInnerIter.size());
+  GRID_ASSERT(correctSize == params.kCycleTol.size());
+  GRID_ASSERT(correctSize == params.kCycleMaxOuterIter.size());
+  GRID_ASSERT(correctSize == params.kCycleMaxInnerIter.size());
 }
 
 struct LevelInfo {
@@ -105,7 +105,7 @@ public:
 
     auto nCoarseLevels = mgParams.blockSizes.size();
 
-    assert(nCoarseLevels == mgParams.nLevels - 1);
+    GRID_ASSERT(nCoarseLevels == mgParams.nLevels - 1);
 
     // set up values for finest grid
     Grids.push_back(FineGrid);
@@ -117,7 +117,7 @@ public:
     for(int level = 1; level < mgParams.nLevels; ++level) {
       auto Nd  = Grids[level - 1]->_ndimension;
       auto tmp = Grids[level - 1]->_fdimensions;
-      assert(tmp.size() == Nd);
+      GRID_ASSERT(tmp.size() == Nd);
 
       Seeds.push_back(std::vector<int>(Nd));
 
diff --git a/tests/solver/Test_wilson_mg.cc b/tests/solver/Test_wilson_mg.cc
index 875bf32a..6ab2295e 100644
--- a/tests/solver/Test_wilson_mg.cc
+++ b/tests/solver/Test_wilson_mg.cc
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
 
   if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
     inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
-    assert(inputXml.length() != 0);
+    GRID_ASSERT(inputXml.length() != 0);
   }
 
   {
diff --git a/tests/solver/Test_wilson_mg_mp.cc b/tests/solver/Test_wilson_mg_mp.cc
index 89bbbf74..9161593c 100644
--- a/tests/solver/Test_wilson_mg_mp.cc
+++ b/tests/solver/Test_wilson_mg_mp.cc
@@ -63,7 +63,7 @@ int main(int argc, char **argv) {
 
   if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
     inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
-    assert(inputXml.length() != 0);
+    GRID_ASSERT(inputXml.length() != 0);
   }
 
   {
diff --git a/tests/solver/Test_wilsonclover_mg.cc b/tests/solver/Test_wilsonclover_mg.cc
index 1b0e8bb7..116a379c 100644
--- a/tests/solver/Test_wilsonclover_mg.cc
+++ b/tests/solver/Test_wilsonclover_mg.cc
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {
 
   if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
     inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
-    assert(inputXml.length() != 0);
+    GRID_ASSERT(inputXml.length() != 0);
   }
 
   {
diff --git a/tests/solver/Test_wilsonclover_mg_lime.cc b/tests/solver/Test_wilsonclover_mg_lime.cc
index 0a29c034..12d84f4d 100644
--- a/tests/solver/Test_wilsonclover_mg_lime.cc
+++ b/tests/solver/Test_wilsonclover_mg_lime.cc
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
 
   if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
     inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
-    assert(inputXml.length() != 0);
+    GRID_ASSERT(inputXml.length() != 0);
   }
 
   {
diff --git a/tests/solver/Test_wilsonclover_mg_mp.cc b/tests/solver/Test_wilsonclover_mg_mp.cc
index 2efe5f08..02322a82 100644
--- a/tests/solver/Test_wilsonclover_mg_mp.cc
+++ b/tests/solver/Test_wilsonclover_mg_mp.cc
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
 
   if(GridCmdOptionExists(argv, argv + argc, "--inputxml")) {
     inputXml = GridCmdOptionPayload(argv, argv + argc, "--inputxml");
-    assert(inputXml.length() != 0);
+    GRID_ASSERT(inputXml.length() != 0);
   }
 
   {
diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc
index d1168764..eb1be6e2 100644
--- a/tests/solver/Test_zMADWF_prec.cc
+++ b/tests/solver/Test_zMADWF_prec.cc
@@ -176,7 +176,7 @@ void run(const TestParams &params){
     NerscIO::readConfiguration(Umu, header, params.config_file);
 
     for(int i=0;i<Nd;i++){
-      assert(header.dimension[i] == GridDefaultLatt()[i]);
+      GRID_ASSERT(header.dimension[i] == GridDefaultLatt()[i]);
     }
   }else{    
     SU<Nc>::HotConfiguration(RNG4, Umu);
@@ -297,7 +297,7 @@ int main(int argc, char** argv) {
     run<RunParamsPrecStd, RunParamsPrecDiagTwo>(params);
   }else if(params.outer_precon == "DiagTwo" && params.inner_precon == "DiagTwo"){
     run<RunParamsPrecDiagTwo, RunParamsPrecDiagTwo>(params);
-  }else assert(0);
+  }else GRID_ASSERT(0);
 
   Grid_finalize();
 }
diff --git a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc
index 7f1f98b8..bd56bc2f 100644
--- a/tests/solver/Test_zmobius_cg_prec.cc
+++ b/tests/solver/Test_zmobius_cg_prec.cc
@@ -111,7 +111,7 @@ int main(int argc, char** argv) {
   GridStopWatch CGTimer;
 
   SchurDiagMooeeOperator<ZMobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000, 0);// switch off the assert
+  ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000, 0);// switch off the GRID_ASSERT
 
   CGTimer.Start();
   CG(HermOpEO, src_o, result_o);
diff --git a/tests/sp2n/Test_2as_base.cc b/tests/sp2n/Test_2as_base.cc
index 3aeccae0..b1ca7d6b 100644
--- a/tests/sp2n/Test_2as_base.cc
+++ b/tests/sp2n/Test_2as_base.cc
@@ -16,7 +16,7 @@ static void check_dimensions() {
     std::cout << GridLogMessage << "Nc = " << this_n << " algebra dimension is " << this_algebra_dim << std::endl;
     realA = Sp_TwoIndex<this_nc, AntiSymmetric>::Dimension + Sp_TwoIndex<this_nc, Symmetric>::Dimension;
     std::cout << GridLogMessage << "Checking dim(2AS) + dim(AS) + 1 = Nc * Nc " << this_algebra_dim << std::endl;
-    assert ( realA == this_nc * this_nc - 1); // Nc x Nc = dim(2indxS) + dim(2indxAS) + dim(singlet)
+    GRID_ASSERT ( realA == this_nc * this_nc - 1); // Nc x Nc = dim(2indxS) + dim(2indxAS) + dim(singlet)
 }
 
 template<int this_nc, TwoIndexSymmetry S>
@@ -35,7 +35,7 @@ static void run_symmetry_checks() {
         Sp_TwoIndex<this_nc, S>::base(a, eij_c);
         e_sum = eij_c - realS * transpose(eij_c);
         std::cout << GridLogMessage << "e_ab - (" << S << " * e_ab^T ) = " << norm2(e_sum) << std::endl;
-        assert(norm2(e_sum) < 1e-8);
+        GRID_ASSERT(norm2(e_sum) < 1e-8);
           
     }
 }
@@ -59,7 +59,7 @@ static void run_traces_checks() {
         realA = norm2(trace(Omega*eij_a));
         std::cout << GridLogMessage << "Checkig Omega-trace for e_{ab=" << a << "} " << std::endl;
         //std::cout << GridLogMessage << "Tr ( Omega e_{ab=" << a << "} ) = " << realA << std::endl;
-        assert(realA < 1e-8);
+        GRID_ASSERT(realA < 1e-8);
         for (int b=0; b < Sp_TwoIndex<this_nc, S>::Dimension; b++) {
             Sp_TwoIndex<this_nc, S>::base(b, eij_b);
             auto d_ab = TensorRemove(trace(eij_a * eij_b));
@@ -68,12 +68,12 @@ static void run_traces_checks() {
     #endif
             std::cout << GridLogMessage << "Checking orthonormality for e_{ab = " << a << "} " << std::endl;
             if (a==b) {
-                assert(real(d_ab) - realS < 1e-8);
+                GRID_ASSERT(real(d_ab) - realS < 1e-8);
             } else {
-                assert(real(d_ab) < 1e-8);
+                GRID_ASSERT(real(d_ab) < 1e-8);
             }
-            assert(imag(d_ab) < 1e-8);
-            assert(imag(d_ab) < 1e-8);
+            GRID_ASSERT(imag(d_ab) < 1e-8);
+            GRID_ASSERT(imag(d_ab) < 1e-8);
         }
     }
     
@@ -118,8 +118,8 @@ static void run_generators_checks() {
             sum_im += imag(TensorRemove(trace(tmp_l+tmp_r)));
         }
         std::cout << GridLogMessage << "re-evaluated trace of the generator " << gen_id << " is " << sum << " " << sum_im << std::endl;
-        assert ( sum < 1e-8) ;
-        assert ( sum_im < 1e-8) ;
+        GRID_ASSERT ( sum < 1e-8) ;
+        GRID_ASSERT ( sum_im < 1e-8) ;
     }
     
 }
diff --git a/tests/sp2n/Test_Sp_start.cc b/tests/sp2n/Test_Sp_start.cc
index dab5819e..ba21902b 100644
--- a/tests/sp2n/Test_Sp_start.cc
+++ b/tests/sp2n/Test_Sp_start.cc
@@ -19,9 +19,9 @@ bool has_correct_group_block_structure(const T& U) {
       auto Ww = conjugate(Wstar);
       auto amizero = sum(W - Ww);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
 
@@ -32,9 +32,9 @@ bool has_correct_group_block_structure(const T& U) {
       auto minusXx = conjugate(minusXstar);
       auto amizero = sum(X + minusXx);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
   return true;
@@ -49,22 +49,22 @@ bool is_element_of_sp2n_group(const T& U) {
   Sp<Nc>::Omega(Omega);
 
   std::cout << GridLogMessage << "Check matrix is non-zero " << std::endl;
-  assert(norm2(U) > 1e-8);
+  GRID_ASSERT(norm2(U) > 1e-8);
 
   std::cout << GridLogMessage << "Unitary check" << std::endl;
   aux = U * adj(U) - identity;
   std::cout << GridLogMessage << "U adjU - 1 = " << norm2(aux) << std::endl;
-  assert(norm2(aux) < 1e-8);
+  GRID_ASSERT(norm2(aux) < 1e-8);
 
   aux = Omega - (U * Omega * transpose(U));
   std::cout << GridLogMessage << "Omega - U Omega transpose(U) = " << norm2(aux)
             << std::endl;
-  assert(norm2(aux) < 1e-8);
+  GRID_ASSERT(norm2(aux) < 1e-8);
 
   std::cout << GridLogMessage
             << "|Det| = " << norm2(Determinant(U)) / U.Grid()->gSites()
             << std::endl;
-  assert(norm2(Determinant(U)) / U.Grid()->gSites() - 1 < 1e-8);
+  GRID_ASSERT(norm2(Determinant(U)) / U.Grid()->gSites() - 1 < 1e-8);
 
   return has_correct_group_block_structure(U);
 }
@@ -91,17 +91,17 @@ int main (int argc, char **argv)
     std::cout << GridLogMessage << "Checking Cold Configuration " << std::endl;
     Sp<Nc>::ColdConfiguration(pRNG,Umu);
     U = PeekIndex<LorentzIndex>(Umu,1);
-    assert(is_element_of_sp2n_group(U));
+    GRID_ASSERT(is_element_of_sp2n_group(U));
     
     std::cout << GridLogMessage << "Checking Hot Configuration" << std::endl;
     Sp<Nc>::HotConfiguration(pRNG,Umu);
     U = PeekIndex<LorentzIndex>(Umu,1);
-    assert(is_element_of_sp2n_group(U));
+    GRID_ASSERT(is_element_of_sp2n_group(U));
     
     std::cout << GridLogMessage << "Checking Tepid Configuration" << std::endl;
     Sp<Nc>::TepidConfiguration(pRNG,Umu);
     U = PeekIndex<LorentzIndex>(Umu,1);
-    assert(is_element_of_sp2n_group(U));
+    GRID_ASSERT(is_element_of_sp2n_group(U));
     
     Grid_finalize();
 
diff --git a/tests/sp2n/Test_project_on_Sp.cc b/tests/sp2n/Test_project_on_Sp.cc
index 63032a68..28e23b62 100644
--- a/tests/sp2n/Test_project_on_Sp.cc
+++ b/tests/sp2n/Test_project_on_Sp.cc
@@ -19,9 +19,9 @@ bool has_correct_group_block_structure(const T& U) {
       auto Ww = conjugate(Wstar);
       auto amizero = sum(W - Ww);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
 
@@ -32,9 +32,9 @@ bool has_correct_group_block_structure(const T& U) {
       auto minusXx = conjugate(minusXstar);
       auto amizero = sum(X + minusXx);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
   return true;
@@ -49,22 +49,22 @@ bool is_element_of_sp2n_group(const T& U) {
   Sp<Nc>::Omega(Omega);
 
   std::cout << GridLogMessage << "Check matrix is non-zero " << std::endl;
-  assert(norm2(U) > 1e-8);
+  GRID_ASSERT(norm2(U) > 1e-8);
 
   std::cout << GridLogMessage << "Unitary check" << std::endl;
   aux = U * adj(U) - identity;
   std::cout << GridLogMessage << "U adjU - 1 = " << norm2(aux) << std::endl;
-  assert(norm2(aux) < 1e-8);
+  GRID_ASSERT(norm2(aux) < 1e-8);
 
   aux = Omega - (U * Omega * transpose(U));
   std::cout << GridLogMessage << "Omega - U Omega transpose(U) = " << norm2(aux)
             << std::endl;
-  assert(norm2(aux) < 1e-8);
+  GRID_ASSERT(norm2(aux) < 1e-8);
 
   std::cout << GridLogMessage
             << "|Det| = " << norm2(Determinant(U)) / U.Grid()->gSites()
             << std::endl;
-  assert(norm2(Determinant(U)) / U.Grid()->gSites() - 1 < 1e-8);
+  GRID_ASSERT(norm2(Determinant(U)) / U.Grid()->gSites() - 1 < 1e-8);
 
   return has_correct_group_block_structure(U);
 }
@@ -86,7 +86,7 @@ void test_group_projections(T U) {
 
   U = U + Delta * identity;
   U = ProjectOnSpGroup(U);
-  assert(is_element_of_sp2n_group(U));
+  GRID_ASSERT(is_element_of_sp2n_group(U));
 
   name = "ProjectOnGeneralGroup";
   std::cout << GridLogMessage << "Testing " << name << std::endl;
@@ -94,7 +94,7 @@ void test_group_projections(T U) {
 
   U = U + Delta * identity;
   U = Sp<Nc>::ProjectOnGeneralGroup(U);
-  assert(is_element_of_sp2n_group(U));
+  GRID_ASSERT(is_element_of_sp2n_group(U));
 
   name = "ProjectOnSpecialGroup";
   std::cout << GridLogMessage << "Testing " << name << std::endl;
@@ -102,7 +102,7 @@ void test_group_projections(T U) {
 
   U = U + Delta * identity;
   Sp<Nc>::ProjectOnSpecialGroup(U);
-  assert(is_element_of_sp2n_group(U));
+  GRID_ASSERT(is_element_of_sp2n_group(U));
 
   name = "ProjectSpn";
   std::cout << GridLogMessage << "Testing " << name << std::endl;
@@ -110,7 +110,7 @@ void test_group_projections(T U) {
 
   U = U + Delta * identity;
   ProjectSpn(U);
-  assert(is_element_of_sp2n_group(U));
+  GRID_ASSERT(is_element_of_sp2n_group(U));
 }
 
 template <typename T>
@@ -129,9 +129,9 @@ bool has_correct_algebra_block_structure(const T& U) {
       auto Ww = conjugate(Wstar);
       auto amizero = sum(W - Ww);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
   for (int c1 = 0; c1 < nsp; c1++) {
@@ -141,9 +141,9 @@ bool has_correct_algebra_block_structure(const T& U) {
       auto minusXx = conjugate(minusXstar);
       auto amizero = sum(X + minusXx);
       auto amizeroo = TensorRemove(amizero);
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
       amizeroo *= i;
-      assert(amizeroo.real() < 10e-6);
+      GRID_ASSERT(amizeroo.real() < 10e-6);
     }
   }
 
@@ -159,7 +159,7 @@ bool is_element_of_sp2n_algebra(const T& U) {
   Sp<Nc>::Omega(Omega);
 
   std::cout << GridLogMessage << "Check matrix is non-zero " << std::endl;
-  assert(norm2(U) > 1e-8);
+  GRID_ASSERT(norm2(U) > 1e-8);
 
   aux = U - adj(U);
   std::cout << GridLogMessage << "T - Tda = " << norm2(aux)
@@ -168,12 +168,12 @@ bool is_element_of_sp2n_algebra(const T& U) {
   aux = U + adj(U);
   std::cout << GridLogMessage << "T + Tda = " << norm2(aux)
             << " (supposed to vanish)" << std::endl;
-  assert(norm2(aux) - 1 < 1e-8);
+  GRID_ASSERT(norm2(aux) - 1 < 1e-8);
 
   std::cout << GridLogMessage << "Check that Omega T Omega + conj(T) = 0 "
             << std::endl;
   aux = Omega * U * Omega + conjugate(U);
-  assert(norm2(aux) < 1e-8);
+  GRID_ASSERT(norm2(aux) < 1e-8);
 
   return has_correct_algebra_block_structure(U);
 }
@@ -196,7 +196,7 @@ void test_algebra_projections(T U) {
 
   U = U + Delta * identity;
   U = SpTa(U);
-  assert(is_element_of_sp2n_algebra(U));
+  GRID_ASSERT(is_element_of_sp2n_algebra(U));
 
   name = "TaProj";
   std::cout << GridLogMessage << "Testing " << name << std::endl;
@@ -205,7 +205,7 @@ void test_algebra_projections(T U) {
   U = U + Delta * identity;
   Sp<Nc>::taProj(U, tmp);
   U = tmp;
-  assert(is_element_of_sp2n_algebra(U));
+  GRID_ASSERT(is_element_of_sp2n_algebra(U));
 }
 
 int main(int argc, char** argv) {