diff --git a/Grid/Makefile.am b/Grid/Makefile.am index ded6d146..7c3c151b 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -54,9 +54,11 @@ Version.h: version-cache include Make.inc include Eigen.inc -extra_sources+=$(ZWILS_FERMION_FILES) extra_sources+=$(WILS_FERMION_FILES) extra_sources+=$(STAG_FERMION_FILES) +if BUILD_ZMOBIUS + extra_sources+=$(ZWILS_FERMION_FILES) +endif if BUILD_GPARITY extra_sources+=$(GP_FERMION_FILES) endif diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index b71981f5..092d4910 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -36,7 +36,7 @@ static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; -accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk) { int nd=rdim.size(); Coordinate coor(nd); diff --git a/Grid/qcd/action/fermion/FermionOperatorImpl.h b/Grid/qcd/action/fermion/FermionOperatorImpl.h index b444f6dc..9345c0e6 100644 --- a/Grid/qcd/action/fermion/FermionOperatorImpl.h +++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h @@ -153,8 +153,8 @@ public: typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::StencilImpl::View_type StencilView; \ - typedef typename ViewMap::Type FermionFieldView; \ - typedef typename ViewMap::Type DoubledGaugeFieldView; + typedef const typename ViewMap::Type FermionFieldView; \ + typedef const typename ViewMap::Type DoubledGaugeFieldView; #define INHERIT_IMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \ diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 10e98f33..0760bcba 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -61,7 +61,7 @@ public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -69,7 +69,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor tmp; projector::Proj(tmp,in,mu,dag); vstream(buf[o],tmp); @@ -81,7 +81,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, const SiteHalfSpinor * __restrict__ vp0, const SiteHalfSpinor * __restrict__ vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor tmp1; SiteHalfSpinor tmp2; exchange(tmp1,tmp2,vp0[o],vp1[o],type); @@ -93,7 +93,7 @@ public: /* Have a decompression step if mpi data is not same */ /*****************************************************/ accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, - SiteHalfSpinor * __restrict__ in, Integer o) { + SiteHalfSpinor * __restrict__ in, Integer o) const { assert(0); } @@ -103,7 +103,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, SiteHalfSpinor * __restrict__ out1, const SiteSpinor * __restrict__ in, - Integer j,Integer k, Integer m,Integer type) + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp3, temp4; @@ -117,7 +117,7 @@ public: /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return false; } + accelerator_inline bool DecompressionStep(void) const { return false; } }; @@ -142,7 +142,7 @@ public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -150,7 +150,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor hsp; SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; projector::Proj(hsp,in,mu,dag); @@ -163,7 +163,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, SiteHalfSpinor *vp0, SiteHalfSpinor *vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor vt0,vt1; SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; @@ -175,7 +175,7 @@ public: /*****************************************************/ /* Have a decompression step if mpi data is not same */ /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){ + accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); } @@ -186,7 +186,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor *out0, SiteHalfSpinor *out1, const SiteSpinor *in, - Integer j,Integer k, Integer m,Integer type){ + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2,temp3,temp4; SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; @@ -200,7 +200,7 @@ public: /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return true; } + accelerator_inline bool DecompressionStep(void) const { return true; } }; diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index d7941d1f..94676b6b 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -72,7 +72,7 @@ public: typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; - typedef typename StencilImpl::View_type StencilView; + typedef const typename StencilImpl::View_type StencilView; ImplParams Params; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index b867369f..688cb75a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -93,7 +93,7 @@ Author: paboyle Chimu_32=coalescedReadPermute(ref()(3)(2),perm); } #define PERMUTE_DIR(dir) ; #else -#define LOAD_CHIMU \ +#define LOAD_CHIMU(ptype) \ {const SiteSpinor & ref (in[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ @@ -482,19 +482,19 @@ Author: paboyle Simd U_11; \ Simd U_21; -#define ZERO_RESULT \ - result_00=S(0.0,0.0); \ - result_01=S(0.0,0.0); \ - result_02=S(0.0,0.0); \ - result_10=S(0.0,0.0); \ - result_11=S(0.0,0.0); \ - result_12=S(0.0,0.0); \ - result_20=S(0.0,0.0); \ - result_21=S(0.0,0.0); \ - result_22=S(0.0,0.0); \ - result_30=S(0.0,0.0); \ - result_31=S(0.0,0.0); \ - result_32=S(0.0,0.0); +#define ZERO_RESULT \ + zeroit(result_00); \ + zeroit(result_01); \ + zeroit(result_02); \ + zeroit(result_10); \ + zeroit(result_11); \ + zeroit(result_12); \ + zeroit(result_20); \ + zeroit(result_21); \ + zeroit(result_22); \ + zeroit(result_30); \ + zeroit(result_31); \ + zeroit(result_32); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 2c1a38e7..b2c7588f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -67,6 +67,7 @@ public: accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; accelerator_inline Real real(void) const { return z.x; }; accelerator_inline Real imag(void) const { return z.y; }; + accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; }; accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { *this = (*this) * r; return *this; diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index be7c89c0..2ce48369 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -7,20 +7,20 @@ template class SimpleCompressor { public: void Point(int) {}; - accelerator_inline int CommDatumSize(void) { return sizeof(vobj); } - accelerator_inline bool DecompressionStep(void) { return false; } - template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) { buf[o]=in; } - accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } + accelerator_inline bool DecompressionStep(void) const { return false; } + template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) const { buf[o]=in; } + accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); } - accelerator_inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type){ + int j,int k, int m,int type) const { exchange(out0[j],out1[j],in[k],in[m],type); } // For cshift. Cshift should drop compressor coupling altogether // because I had to decouple the code from the Stencil anyway - accelerator_inline vobj operator() (const vobj &arg) { + accelerator_inline vobj operator() (const vobj &arg) const { return arg; } }; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 23fc8203..58cebed3 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -147,16 +147,16 @@ class CartesianStencilAccelerator { cobj* u_recv_buf_p; cobj* u_send_buf_p; - accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } + accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) const { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } - accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; @@ -168,14 +168,14 @@ class CartesianStencilAccelerator { } } - accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) { + accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; int local = this->_entries_p[ent]._is_local; if (local) return base + this->_entries_p[ent]._byte_offset; else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) const { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } @@ -221,7 +221,7 @@ public: typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; - typedef CartesianStencilView View_type; + typedef const CartesianStencilView View_type; typedef typename View_type::StencilVector StencilVector; /////////////////////////////////////////// // Helper structs diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index ede24fbe..672f385f 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -66,7 +66,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ #ifndef GRID_SYCL -// Use the scalar as our own complex on GPU +// Use the scalar as our own complex on GPU ... thrust::complex or std::complex template = 0> accelerator_inline typename vsimd::scalar_type coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) @@ -96,6 +96,8 @@ void coalescedWrite(vsimd & __restrict__ vec, p[lane]=extracted; } #else +// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops +// Faster for some reason template = 0> accelerator_inline typename vsimd::vector_type::datum coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2b7bf53a..f1a694fb 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -456,7 +456,7 @@ accelerator_inline void acceleratorSynchronise(void) __syncwarp(); #endif #ifdef GRID_SYCL - // No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier + cl::sycl::detail::workGroupBarrier(); #endif #ifdef GRID_HIP __syncthreads(); diff --git a/configure.ac b/configure.ac index fb0c78fc..5f165412 100644 --- a/configure.ac +++ b/configure.ac @@ -140,12 +140,23 @@ AC_ARG_ENABLE([gparity], [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes]) AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ]) + +AC_ARG_ENABLE([zmobius], + [AC_HELP_STRING([--enable-zmobius=yes|no], [enable Zmobius support])], + [ac_ZMOBIUS=${enable_zmobius}], [ac_ZMOBIUS=yes]) + +AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ]) + + case ${ac_FERMION_REPS} in yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);; esac case ${ac_GPARITY} in yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);; esac +case ${ac_ZMOBIUS} in + yes) AC_DEFINE([ENABLE_ZMOBIUS],[1],[Zmobius fermion actions]);; +esac ############### Nc AC_ARG_ENABLE([Nc], [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],