From 33097681b92df90ee17a871ecfeb6b02fbd2c5d0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 14 Oct 2023 00:42:55 +0300
Subject: [PATCH] FTHMC compiled and merged to develop

---
 Grid/lattice/Lattice_reduction_gpu.h |  2 +-
 Grid/qcd/utils/GaugeGroup.h          | 62 +++++++++++++++++++++++++++-
 Grid/qcd/utils/SUn.impl.h            |  2 +
 Grid/threads/Accelerator.cc          |  2 +-
 Grid/threads/Accelerator.h           | 16 +++----
 systems/Lumi/config-command          |  2 +-
 6 files changed, 73 insertions(+), 13 deletions(-)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index ecf90d198..e82494f57 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
   cudaGetDevice(&device);
 #endif
 #ifdef GRID_HIP
-  hipGetDevice(&device);
+  auto r=hipGetDevice(&device);
 #endif
   
   Iterator warpSize            = gpu_props[device].warpSize;
diff --git a/Grid/qcd/utils/GaugeGroup.h b/Grid/qcd/utils/GaugeGroup.h
index f92064f43..6811d247e 100644
--- a/Grid/qcd/utils/GaugeGroup.h
+++ b/Grid/qcd/utils/GaugeGroup.h
@@ -100,6 +100,9 @@ class GaugeGroup {
   using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
   template <typename vtype>
   using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
+  // Site matrix of size AdjointDimension x AdjointDimension over vtype.
+  template <typename vtype>
+  using iSUnAlgebraMatrix = iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
   static int su2subgroups(void) { return su2subgroups(group_name()); }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -128,10 +131,19 @@ class GaugeGroup {
   typedef Lattice<vMatrix> LatticeMatrix;
   typedef Lattice<vMatrixF> LatticeMatrixF;
   typedef Lattice<vMatrixD> LatticeMatrixD;
-
+  
   typedef Lattice<vAlgebraVector> LatticeAlgebraVector;
   typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
   typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
+  // Matrix types built on iSUnAlgebraMatrix, and the Lattice fields of them.
+  typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix;
+  typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
+  typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
+
+  typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix;
+  typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
+  typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
+  
 
   typedef iSU2Matrix<Complex> SU2Matrix;
   typedef iSU2Matrix<ComplexF> SU2MatrixF;
@@ -160,7 +172,7 @@ class GaugeGroup {
     return generator(lieIndex, ta, group_name());
   }
 
-  static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
+  static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) { // forwards to the overload selected by group_name()
     return su2SubGroupIndex(i1, i2, su2_index, group_name());
   }
 
@@ -389,6 +401,52 @@ class GaugeGroup {
     }
   }
 
+// Project `in` onto the Lie-algebra generators, storing the coefficients in
+// column b of out: only the entries out(a,b) are assigned here.
+static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
+{
+  conformable(in, out);
+  GridBase *grid = out.Grid();
+  // Using Luchang's projection convention
+  //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
+  // in is traceless ANTI-hermitian (i Ta basis) whereas Grid generators Ta
+  // are Hermitian, hence the real/imag combinations taken below.
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(in_v,in,AcceleratorRead);
+  int N = ncolour;
+  int NNm1 = N * (N - 1);
+  int hNNm1= NNm1/2;
+  // Off-diagonal part: su2 subgroup (i1,i2) fills rows ax=2*su2Index and ay=ax+1.
+  for(int su2Index=0;su2Index<hNNm1;su2Index++){
+    int i1, i2;
+    su2SubGroupIndex(i1, i2, su2Index);
+    int ax = su2Index*2;
+    int ay = su2Index*2+1;
+    accelerator_for(ss,grid->oSites(),1,{
+      out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
+      out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
+    });
+  }
+  // Diagonal part: rows NNm1 .. NNm1+N-2, built from the imaginary part of
+  // (sum_{i<k} in(i,i) - k*in(k,k)) scaled by 1/sqrt(2k(k+1)).
+  // NOTE(review): this launch passes vComplex::Nsimd() where the loop above
+  // passes 1, although both access in_v/out_v the same way - confirm intent.
+  for(int diagIndex=0;diagIndex<N-1;diagIndex++){
+    int k = diagIndex + 1; // diagIndex starts from 0
+    int a = NNm1+diagIndex;
+    RealD scale = 1.0/sqrt(2.0*k*(k+1));
+    accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
+      auto tmp = in_v[ss]()()(0,0);
+      for(int i=1;i<k;i++){
+        tmp=tmp+in_v[ss]()()(i,i);
+      }
+      tmp = tmp - in_v[ss]()()(k,k)*k;
+      out_v[ss]()()(a,b) =imag(tmp) * scale;
+    });
+  }
+}
+
+  
 };
     
 template <int ncolour>
diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h
index e19f970cb..02fa161be 100644
--- a/Grid/qcd/utils/SUn.impl.h
+++ b/Grid/qcd/utils/SUn.impl.h
@@ -10,6 +10,7 @@
 // doesn't get found by the scripts/filelist during bootstrapping.
 
 private:
+
 template <ONLY_IF_SU>
 static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }
 ////////////////////////////////////////////////////////////////////////
@@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie
   LieRandomize(pRNG,g,1.0);
   GaugeTransform<Gimpl>(Umu,g);
 }
+
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 70f469b00..3769b2aa6 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -147,7 +147,7 @@ void acceleratorInit(void)
 #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
 #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");
     
-    hipGetDeviceProperties(&gpu_props[i], i);
+    auto r=hipGetDeviceProperties(&gpu_props[i], i);
     hipDeviceProp_t prop; 
     prop = gpu_props[i];
     totalDeviceMem = prop.totalGlobalMem;
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index f362a0777..ff5ccd7a3 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -405,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 
 #define accelerator_barrier(dummy)				\
   {								\
-    hipStreamSynchronize(computeStream);			\
+    auto r=hipStreamSynchronize(computeStream);			\
     auto err = hipGetLastError();				\
     if ( err != hipSuccess ) {					\
       printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -438,19 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
 
-inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr); if ( r != hipSuccess ) printf("acceleratorFreeShared: hipFree failed: %s \n", hipGetErrorString( r ));}; // HIP error is reported, not fatal
+inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr); if ( r != hipSuccess ) printf("acceleratorFreeDevice: hipFree failed: %s \n", hipGetErrorString( r ));}; // HIP error is reported, not fatal
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice); if ( r != hipSuccess ) printf("acceleratorCopyToDevice: hipMemcpy failed: %s \n", hipGetErrorString( r ));} // HIP error is reported, not fatal
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost); if ( r != hipSuccess ) printf("acceleratorCopyFromDevice: hipMemcpy failed: %s \n", hipGetErrorString( r ));} // HIP error is reported, not fatal
 //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
 //inline void acceleratorCopySynchronise(void) {  }
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes); if ( r != hipSuccess ) printf("acceleratorMemSet: hipMemset failed: %s \n", hipGetErrorString( r ));} // HIP error is reported, not fatal
 
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
-  hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream); if ( r != hipSuccess ) printf("acceleratorCopyDeviceToDeviceAsynch: hipMemcpyDtoDAsync failed: %s \n", hipGetErrorString( r )); // enqueue error is reported, not fatal
 }
-inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
+inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); if ( r != hipSuccess ) printf("acceleratorCopySynchronise: hipStreamSynchronize failed: %s \n", hipGetErrorString( r )); }; // HIP error is reported, not fatal
 
 #endif
 
diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command
index 3f7877c88..5e5962852 100644
--- a/systems/Lumi/config-command
+++ b/systems/Lumi/config-command
@@ -23,7 +23,7 @@ echo mpfr X$MPFR
 --disable-fermion-reps \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
+  CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
   LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"