diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index ecf90d19..e82494f5 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & cudaGetDevice(&device); #endif #ifdef GRID_HIP - hipGetDevice(&device); + auto r=hipGetDevice(&device); #endif Iterator warpSize = gpu_props[device].warpSize; diff --git a/Grid/qcd/utils/GaugeGroup.h b/Grid/qcd/utils/GaugeGroup.h index f92064f4..6811d247 100644 --- a/Grid/qcd/utils/GaugeGroup.h +++ b/Grid/qcd/utils/GaugeGroup.h @@ -100,6 +100,9 @@ class GaugeGroup { using iGroupMatrix = iScalar > >; template using iAlgebraVector = iScalar > >; + template + using iSUnAlgebraMatrix = + iScalar > >; static int su2subgroups(void) { return su2subgroups(group_name()); } ////////////////////////////////////////////////////////////////////////////////////////////////// @@ -128,10 +131,19 @@ class GaugeGroup { typedef Lattice LatticeMatrix; typedef Lattice LatticeMatrixF; typedef Lattice LatticeMatrixD; - + typedef Lattice LatticeAlgebraVector; typedef Lattice LatticeAlgebraVectorF; typedef Lattice LatticeAlgebraVectorD; + + typedef iSUnAlgebraMatrix vAlgebraMatrix; + typedef iSUnAlgebraMatrix vAlgebraMatrixF; + typedef iSUnAlgebraMatrix vAlgebraMatrixD; + + typedef Lattice LatticeAlgebraMatrix; + typedef Lattice LatticeAlgebraMatrixF; + typedef Lattice LatticeAlgebraMatrixD; + typedef iSU2Matrix SU2Matrix; typedef iSU2Matrix SU2MatrixF; @@ -160,7 +172,7 @@ class GaugeGroup { return generator(lieIndex, ta, group_name()); } - static void su2SubGroupIndex(int &i1, int &i2, int su2_index) { + static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) { return su2SubGroupIndex(i1, i2, su2_index, group_name()); } @@ -389,6 +401,52 @@ class GaugeGroup { } } +// Ta are hermitian (?) +// Anti herm is i Ta basis +static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b) +{ + conformable(in, out); + GridBase *grid = out.Grid(); + LatticeComplex tmp(grid); + Matrix ta; + // Using Luchang's projection convention + // 2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a + autoView(out_v,out,AcceleratorWrite); + autoView(in_v,in,AcceleratorRead); + int N = ncolour; + int NNm1 = N * (N - 1); + int hNNm1= NNm1/2; + RealD sqrt_2 = sqrt(2.0); + Complex ci(0.0,1.0); + for(int su2Index=0;su2IndexoSites(),1,{ + // in is traceless ANTI-hermitian whereas Grid generators are Hermitian. + // trace( Ta x Ci in) + // Bet I need to move to real part with mult by -i + out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); + out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); + }); + } + for(int diagIndex=0;diagIndexoSites(),vComplex::Nsimd(),{ + auto tmp = in_v[ss]()()(0,0); + for(int i=1;i diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h index e19f970c..02fa161b 100644 --- a/Grid/qcd/utils/SUn.impl.h +++ b/Grid/qcd/utils/SUn.impl.h @@ -10,6 +10,7 @@ // doesn't get found by the scripts/filelist during bootstrapping. private: + template static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; } //////////////////////////////////////////////////////////////////////// @@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie LieRandomize(pRNG,g,1.0); GaugeTransform(Umu,g); } + diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 70f469b0..3769b2aa 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -147,7 +147,7 @@ void acceleratorInit(void) #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); - hipGetDeviceProperties(&gpu_props[i], i); + auto r=hipGetDeviceProperties(&gpu_props[i], i); hipDeviceProp_t prop; prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f362a077..ff5ccd7a 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -405,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipStreamSynchronize(computeStream); \ + auto r=hipStreamSynchronize(computeStream); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ @@ -438,19 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; -inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);}; +inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; +inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} //inline void acceleratorCopySynchronise(void) { } -inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch { - hipMemcpyDtoDAsync(to,from,bytes, copyStream); + auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; +inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); }; #endif diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command index 3f7877c8..5e596285 100644 --- a/systems/Lumi/config-command +++ b/systems/Lumi/config-command @@ -23,7 +23,7 @@ echo mpfr X$MPFR --disable-fermion-reps \ --disable-gparity \ CXX=hipcc MPICXX=mpicxx \ - CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ + CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"