diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index e74d3894..c62d9cdb 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,6 +34,12 @@ #define __SYCL__REDEFINE__ #endif +/* HIP save and restore compile environment*/ +#ifdef GRID_HIP +#pragma push +#pragma push_macro("__HIP_DEVICE_COMPILE__") +#endif +#define EIGEN_NO_HIP #include #include @@ -52,6 +58,12 @@ #pragma pop #endif +/*HIP restore*/ +#ifdef __HIP__REDEFINE__ +#pragma pop_macro("__HIP_DEVICE_COMPILE__") +#pragma pop +#endif + #if defined __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 436d75ef..bb06d43f 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -138,21 +138,6 @@ public: int recv_from_rank, int bytes); - void SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes); - - void SendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendToRecvFromComplete(std::vector &waitall); - double StencilSendToRecvFrom(void *xmit, int xmit_to_rank, void *recv, diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 81900371..6cb431a2 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes) -{ - assert(0); -} - // Basic Halo comms primitive -- should never call in single node void CartesianCommunicator::SendToRecvFrom(void *xmit, @@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - assert(0); -} - -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) -{ - assert(0); -} void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) { bcopy(in,out,bytes*words); @@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int recv_from_rank, int bytes, int dir) { - std::vector list; - // Discard the "dir" - SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); - SendToRecvFromComplete(list); return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, @@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) { - SendToRecvFromComplete(waitall); } void CartesianCommunicator::StencilBarrier(void){}; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 8b27ab7a..0cbde9eb 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -32,6 +32,9 @@ Author: Peter Boyle #ifdef GRID_CUDA #include #endif +#ifdef GRID_HIP +#include +#endif NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " @@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// -#ifdef GRID_CUDA +#if defined(GRID_CUDA) ||defined(GRID_HIP) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// -#ifndef GRID_MPI3_SHM_NONE - auto err = cudaMalloc(&ShmCommBuf, bytes); -#else - auto err = cudaMallocManaged(&ShmCommBuf, bytes); -#endif - if ( err != cudaSuccess) { - std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " < &ret,const Lattice &lhs,const Lattice &rhs){ autoView( lhs_v , lhs, AcceleratorRead); autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); auto rhs_t=rhs_v(ss); + auto tmp =ret_v(ss); mac(&tmp,&lhs_t,&rhs_t); coalescedWrite(ret_v[ss],tmp); }); @@ -124,7 +124,7 @@ void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto lhs_t=lhs_v(ss); mac(&tmp,&lhs_t,&rhs); coalescedWrite(ret_v[ss],tmp); @@ -182,7 +182,7 @@ void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ autoView( ret_v , ret, AcceleratorWrite); autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ - decltype(coalescedRead(obj1())) tmp; + auto tmp =ret_v(ss); auto rhs_t=rhs_v(ss); mac(&tmp,&lhs,&rhs_t); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 5f490507..d8a47ae1 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid); #ifdef GRID_HIP extern hipDeviceProp_t *gpu_props; +#define WARP_SIZE 64 #endif #ifdef GRID_CUDA extern cudaDeviceProp *gpu_props; +#define WARP_SIZE 32 #endif -#define WARP_SIZE 32 __device__ unsigned int retirementCount = 0; template @@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid // cannot use overloaded operators for sobj as they are not volatile-qualified memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); - __syncwarp(); + acceleratorSynchronise(); const Iterator VEC = WARP_SIZE; const Iterator vid = tid & (VEC-1); @@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid beta += temp; memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); } - __syncwarp(); + acceleratorSynchronise(); } - __syncthreads(); + acceleratorSynchroniseAll(); if (threadIdx.x == 0) { beta = Zero(); @@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid } memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); } - __syncthreads(); + acceleratorSynchroniseAll(); } diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 30deee06..d67105bb 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -63,17 +63,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + template + static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -82,17 +85,20 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - template accelerator_inline + + template static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - template accelerator_inline + + template static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, @@ -101,6 +107,7 @@ template class StaggeredKernels : public FermionOperator , pub /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e79b64dc..b3fbe096 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -799,7 +799,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); for (int s=0;s::SeqConservedCurrent(PropagatorField &q_in, PropagatorField tmp(UGrid); PropagatorField Utmp(UGrid); - LatticeInteger zz (UGrid); zz=0.0; + PropagatorField zz (UGrid); zz=0.0; LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); for(int s=0;s -template +template accelerator_inline void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -221,7 +221,7 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -300,7 +300,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, template -template +template accelerator_inline void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index 141725a7..0b6f9fb0 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -78,7 +78,7 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -126,7 +126,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, // Only contributions from interior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -174,7 +174,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilView &st, // Only contributions from exterior of our node /////////////////////////////////////////////////// template -template +template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, @@ -224,7 +224,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilView &st, //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// -template +template void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) { @@ -253,7 +253,7 @@ void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ }); -template +template void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) @@ -293,7 +293,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, } assert(0 && " Kernel optimisation case not covered "); } -template +template void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h index 2150938f..a592a798 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h @@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid); HAND_RESULT_EXT(ss,F) #define HAND_SPECIALISE_GPARITY(IMPL) \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid); HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ @@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid); nmu = 0; \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ } \ - template<> void \ + template<> accelerator_inline void \ WilsonKernels::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ { \ diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index f7b018fa..89ae5668 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -495,7 +495,7 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -template void +template accelerator_inline void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -519,7 +519,7 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -542,7 +542,7 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -566,7 +566,7 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -589,7 +589,7 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi HAND_RESULT(ss); } -template void +template accelerator_inline void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { @@ -614,7 +614,7 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si HAND_RESULT_EXT(ss); } -template +template accelerator_inline void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index c2b62416..c5f50bbb 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// // All legs kernels ; comms then compute //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -140,7 +140,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV coalescedWrite(out[sF],result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -169,7 +169,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// // Interior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -197,7 +197,7 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi coalescedWrite(out[sF], result,lane); }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -227,7 +227,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// // Exterior kernels //////////////////////////////////////////////////////////////////// -template +template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -258,7 +258,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi } }; -template +template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) @@ -290,7 +290,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField }; #define DhopDirMacro(Dir,spProj,spRecon) \ - template \ + template accelerator_inline \ void WilsonKernels::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ { \ @@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm); DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Tm,spProjTm,spReconTm); -template +template accelerator_inline void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index b9c6a81b..8b17f75a 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -41,6 +41,11 @@ Author: Peter Boyle namespace Grid { +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +typedef struct { uint16_t x;} half; +#endif +typedef struct Half2_t { half x; half y; } Half2; + #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) template @@ -125,14 +130,14 @@ inline accelerator GpuVector operator/(const GpuVector l,const } constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half); -constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2); +constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2); constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float); constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double); constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer); -typedef GpuComplex GpuComplexH; +typedef GpuComplex GpuComplexH; typedef GpuComplex GpuComplexF; typedef GpuComplex GpuComplexD; @@ -147,11 +152,9 @@ typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) f = __half2float(h); #else - //f = __half2float(h); - __half_raw hr(h); Grid_half hh; hh.x = hr.x; f= sfw_half_to_float(hh); @@ -161,13 +164,11 @@ accelerator_inline float half2float(half h) accelerator_inline half float2half(float f) { half h; -#ifdef GRID_SIMT +#if defined(GRID_CUDA) || defined(GRID_HIP) h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); - __half_raw hr; - hr.x = hh.x; - h = __half(hr); + h.x = hh.x; #endif return h; } @@ -523,7 +524,7 @@ namespace Optimization { //////////////////////////////////////////////////////////////////////////////////// // Single / Half //////////////////////////////////////////////////////////////////////////////////// - static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { + static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { int N = GpuVectorCF::N; GpuVectorCH h; for(int i=0;i({45,12,81,9})); + + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); + + for(int64_t i=0;i