Mirror of https://github.com/paboyle/Grid.git, synced 2025-06-14 13:57:07 +01:00

Compare commits


6 Commits

SHA1 Message Date
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
8 changed files with 486 additions and 126 deletions

View File

@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@@ -65,10 +65,10 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@@ -175,11 +175,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond();
}
}
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -301,12 +303,243 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
}
#endif
NAMESPACE_END(Grid);
#endif
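For orientation, a minimal sketch of driving the Cshift entry point shown above, using Grid's stock initialisation helpers; the lattice size, dimension and shift are illustrative, not taken from the diff:

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  GridCartesian grid(latt_size, simd_layout, mpi_layout);

  GridParallelRNG pRNG(&grid);
  pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeComplex U(&grid);
  random(pRNG, U);

  // Circular shift by one site along dimension 0; with Cshift_verbose=0
  // the copy/gather/scatter/comm timers added above stay silent.
  LatticeComplex Ushift = Cshift(U, 0, 1);

  Grid_finalize();
  return 0;
}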

View File

@@ -522,11 +522,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
RealD t_sum =-usecond();
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
t_sum +=usecond();
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
RealD t_rest =-usecond();
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
@@ -556,6 +559,9 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
t_rest +=usecond();
std::cout << GridLogMessage << " sliceSum local "<<t_sum<<" us, host+mpi "<<t_rest<<" us"<<std::endl;
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
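The routine instrumented above reduces a lattice to one scalar object per slice along orthogdim, with the GlobalSumVector completing the reduction across ranks. A minimal sketch of the calling pattern, the same one MesonFieldGPU uses later in this compare (assumes the grid and field setup of the previous sketch):

  std::vector<TComplex> sliced;   // one scalar object per time slice
  sliceSum(U, sliced, Tp);        // Tp is Grid's time-direction index
  // sliced[t] now holds the sum of U over the entire t-th time plane.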

View File

@@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid);
#undef DELTA_F_EQ_2
///////////////////////////////////////////////////////////////////
//Meson
// Interested in
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
//
// Conventional meson field:
//
// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
// = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty)|G| v_j(y,ty) >
// = sum_ij PI_ji(tx) PI_ij(ty)
//
// G5-Hermiticity
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| G g5 sum_i (|v_i(y,ty)> <w_i(x,tx)|)^dag ] -- (*)
//
// NB: Dag applies to internal indices spin,colour,complex
//
// = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)> <v_i(y,ty)| ]
// = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)>
// = sum_ij PionVV(ty) PionWW(tx)
//
// (*) is only a correct estimator if w_i and w_j come from distinct noise sets to preserve the Kronecker
// expectation value. Otherwise biased.
////////////////////////////////////////////////////////////////////
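// In LaTeX, the identity above reads
//   \sum_{\vec x,\vec y} \mathrm{Tr}\left[ \Gamma\, S(x,t_x;y,t_y)\, \Gamma\, S(y,t_y;x,t_x) \right]
//     = \sum_{ij} \Pi_{ji}(t_x)\, \Pi_{ij}(t_y) ,
// with S(x;y) = \sum_i |v_i(x)\rangle\langle w_i(y)| and
//      \Pi_{ij}(t) = \sum_{\vec x} \langle w_i(x,t)| \Gamma |v_j(x,t)\rangle .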
template <typename FImpl>
class A2Autils
{
@@ -26,7 +54,9 @@ public:
typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
// output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
template <typename TensorType>
static void MesonField(TensorType &mat,
const FermionField *lhs_wi,
const FermionField *rhs_vj,
@@ -34,6 +64,14 @@ public:
const std::vector<ComplexField > &mom,
int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
template <typename TensorType>
static void MesonFieldGPU(TensorType &mat,
const FermionField *lhs_wi,
const FermionField *rhs_vj,
std::vector<Gamma::Algebra> gammas,
const std::vector<ComplexField > &mom,
int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
/*
static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
const FermionField *wi,
const FermionField *vj,
@@ -58,7 +96,8 @@ public:
const FermionField *vi,
const FermionField *vj,
int orthogdim);
*/
template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
static void AslashField(TensorType &mat,
const FermionField *lhs_wi,
@@ -159,14 +198,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int MFlvol = ld*Lblock*Rblock*Nmom;
std::vector<SpinMatrix_v > lvSum(MFrvol);
thread_for( r, MFrvol,{
for(int r=0;r<MFrvol;r++){
lvSum[r] = Zero();
});
}
std::vector<SpinMatrix_s > lsSum(MFlvol);
thread_for(r,MFlvol,{
for(int r=0;r<MFlvol;r++){
lsSum[r]=scalar_type(0.0);
});
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
@@ -174,7 +213,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
// potentially wasting cores here if local time extent too small
if (t_kernel) *t_kernel = -usecond();
thread_for(r,rd,{
for(int r=0;r<rd;r++) {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -213,10 +252,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
}
}
}
});
};
// Sum across simd lanes in the plane, breaking out orthog dir.
thread_for(rt,rd,{
for(int rt=0;rt<rd;rt++){
Coordinate icoor(Nd);
ExtractBuffer<SpinMatrix_s> extracted(Nsimd);
@@ -241,7 +280,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
}
}}}
});
}
if (t_kernel) *t_kernel += usecond();
assert(mat.dimension(0) == Nmom);
assert(mat.dimension(1) == Ngamma);
@@ -290,35 +329,112 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
if (t_gsum) *t_gsum += usecond();
}
const int A2Ablocking=8;
template<typename vtype> using iVecSpinMatrix = iVector<iMatrix<iScalar<vtype>, Ns>, A2Ablocking>;
typedef iVecSpinMatrix<Complex > VecSpinMatrix;
typedef iVecSpinMatrix<vComplex > vVecSpinMatrix;
typedef Lattice<vVecSpinMatrix> LatticeVecSpinMatrix;
///////////////////////////////////////////////////////////////////
//Meson
// Interested in
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
//
// Conventional meson field:
//
// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
// = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty)|G| v_j(y,ty) >
// = sum_ij PI_ji(tx) PI_ij(ty)
//
// G5-Hermiticity
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| G g5 sum_i (|v_i(y,ty)> <w_i(x,tx)|)^dag ] -- (*)
//
// NB: Dag applies to internal indices spin,colour,complex
//
// = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)> <v_i(y,ty)| ]
// = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)| G g5 |w_i(x,tx)>
// = sum_ij PionVV(ty) PionWW(tx)
//
// (*) is only a correct estimator if w_i and w_j come from distinct noise sets to preserve the Kronecker
// expectation value. Otherwise biased.
////////////////////////////////////////////////////////////////////
template <class FImpl>
template <typename TensorType>
void A2Autils<FImpl>::MesonFieldGPU(TensorType &mat,
const FermionField *lhs_wi,
const FermionField *rhs_vj,
std::vector<Gamma::Algebra> gammas,
const std::vector<ComplexField > &mom,
int orthogdim, double *t_kernel, double *t_gsum)
{
const int block=A2Ablocking;
typedef typename FImpl::SiteSpinor vobj;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Lblock = mat.dimension(3);
int Rblock = mat.dimension(4);
// assert(Lblock % block==0);
// assert(Rblock % block==0);
GridBase *grid = lhs_wi[0].Grid();
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
int Nt = grid->GlobalDimensions()[orthogdim];
int Ngamma = gammas.size();
int Nmom = mom.size();
LatticeVecSpinMatrix SpinMat(grid);
LatticeVecSpinMatrix MomSpinMat(grid);
RealD t_afor = 0.0;
RealD t_sum = 0.0;
RealD t_pha = 0.0;
RealD t_trace= 0.0;
uint64_t ncall=0;
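// Outline of the loop nest below: for each left vector w_i, form the
// sitewise spin outer product <w_i|v_j> for a block of up to A2Ablocking
// right vectors inside one LatticeVecSpinMatrix; then, per momentum,
// apply the phase field, sliceSum over orthogdim, and trace each time
// slice against every Gamma insertion on the host.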
std::vector<VecSpinMatrix> sliced;
for(int i=0;i<Lblock;i++){
autoView(SpinMat_v,SpinMat,AcceleratorWrite);
autoView(lhs_v,lhs_wi[i],AcceleratorRead);
for(int jo=0;jo<Rblock;jo+=block){
for(int j=jo;j<MIN(Rblock,jo+block);j++){
int jj=j%block;
autoView(rhs_v,rhs_vj[j],AcceleratorRead); // Create a vector of views
//////////////////////////////////////////
// Should write a SpinOuterColorTrace
//////////////////////////////////////////
t_afor-=usecond();
accelerator_for(ss,grid->oSites(),(size_t)Nsimd,{
auto left = conjugate(lhs_v(ss));
auto right = rhs_v(ss);
auto vv = SpinMat_v(ss);
for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){
vv(jj)(s1,s2)() = left()(s2)(0) * right()(s1)(0)
+ left()(s2)(1) * right()(s1)(1)
+ left()(s2)(2) * right()(s1)(2);
}}
coalescedWrite(SpinMat_v[ss],vv);
});
t_afor+=usecond();
}// j within block
// After getting the sitewise product do the mom phase loop
for(int m=0;m<Nmom;m++){
t_pha-=usecond();
MomSpinMat = SpinMat * mom[m];
t_pha+=usecond();
t_sum-=usecond();
ncall++;
sliceSum(MomSpinMat,sliced,orthogdim);
t_sum+=usecond();
t_trace-=usecond();
for(int mu=0;mu<Ngamma;mu++){
for(int t=0;t<sliced.size();t++){
for(int j=jo;j<MIN(Rblock,jo+block);j++){
int jj=j%block;
auto tmp = sliced[t](jj);
auto trSG = trace(tmp*Gamma(gammas[mu]));
mat(m,mu,t,i,j) = trSG()();
}
}
}
t_trace+=usecond();
}
}//jo
}
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_afor "<<t_afor<<" us"<<std::endl;
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_pha "<<t_pha<<" us"<<std::endl;
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_sum "<<t_sum<<" us"<<std::endl;
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU N_sum "<<ncall<<" calls"<<std::endl;
std::cout << GridLogMessage<< " A2AUtils::MesonFieldGPU t_trace "<<t_trace<<" us"<<std::endl;
}
/*
template<class FImpl>
void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
const FermionField *wi,
@@ -645,6 +761,7 @@ void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
const int nog5=0;
PionFieldXX(mat,vi,vj,orthogdim,nog5);
}
*/
// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
//
@@ -992,9 +1109,9 @@ typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::va
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
void>::type
A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
const TensorType &WW_sd,
const FermionField *vs,
const FermionField *vd)
const TensorType &WW_sd,
const FermionField *vs,
const FermionField *vd)
{
GridBase *grid = vs[0].Grid();
@@ -1062,7 +1179,6 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
}
for (int t = 0; t < N_t; t++){
std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
buf = WW_sd[t];
thread_for(ss,grid->oSites(),{
for(int d_o=0;d_o<N_d;d_o+=d_unroll){

View File

@@ -132,27 +132,17 @@ inline void cuda_mem(void)
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
if ( num1*num2 ) { \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
} \
}
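// Usage sketch (cf. MesonFieldGPU earlier in this compare): with `field_v`
// an AcceleratorWrite view of a lattice,
//   accelerator_for(ss, grid->oSites(), vComplex::Nsimd(), {
//     coalescedWrite(field_v[ss], 2.0 * field_v(ss));
//   });
// The new num1*num2 guard skips the launch when the iteration space is
// empty; a zero-extent dim3 grid would otherwise be an invalid CUDA launch
// configuration.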
#define accelerator_for6dNB(iter1, num1, \
@@ -175,19 +165,6 @@ inline void cuda_mem(void)
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
@@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
Lambda(x,y,z);
}
}
template<typename lambda> __global__
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
// Weird permute is to make lane coalesce for large blocks
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@@ -523,9 +489,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
accelerator_barrier(dummy);
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \

View File

@@ -0,0 +1,16 @@
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--with-lime=$CLIME \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--enable-accelerator=cuda \
--disable-gparity \
--disable-fermion-reps \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"

View File

@@ -0,0 +1,12 @@
. /home/paboyle/spack/share/spack/setup-env.sh
spack load cuda@12.0.0
spack load c-lime
spack load gmp
spack load mpfr
spack load openmpi
export CUDA=`spack find --paths cuda@12.0.0 | grep cuda | cut -c 14-`
export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
export GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/
export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH

View File

@@ -1,7 +1,7 @@
spack load c-lime
spack load gmp
spack load mpfr
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-`
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
echo clime X$CLIME

View File

@@ -56,13 +56,9 @@ int main(int argc, char *argv[])
// MesonField lhs and rhs vectors
std::vector<FermionField> phi(VDIM,&grid);
std::vector<FermionField> rho(VDIM,&grid);
FermionField rho_tmp(&grid);
std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
for (unsigned int i = 0; i < VDIM; ++i){
random(pRNG,phi[i]);
random(pRNG,rho_tmp); //ideally only nonzero on t=0
rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0
}
std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;
@@ -82,7 +78,7 @@
{1.,1.,1.},
{2.,0.,0.}
};
// 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call
std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;
std::cout << GridLogMessage << "Computing complex phases" << std::endl;
@@ -102,28 +98,47 @@ int main(int argc, char *argv[])
std::cout << GridLogMessage << "Computing complex phases done." << std::endl;
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
// timer
double start,stop;
//execute meson field routine
std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl;
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl;
start = usecond();
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
start = usecond();
/* Ideally, for this meson field we could pass TSRC (even better, a list of timeslices)
* to the routine so that all the components which are predictably equal to zero are not computed. */
A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
start = usecond();
A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl;
A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl;
start = usecond();
A2Autils<WilsonImplR>::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl;
for(int mom=0;mom<momenta.size();mom++){
for(int mu=0;mu<Gmu.size();mu++){
for(int t=0;t<Nt;t++){
for(int v=0;v<VDIM;v++){
for(int w=0;w<VDIM;w++){
std::cout << GridLogMessage
<< " " << mom
<< " " << mu
<< " " << t
<< " " << v
<< " " << w
<< " " << Mpp_gpu(mom,mu,t,v,w)
<< " " << Mpp(mom,mu,t,v,w) << std::endl;
}
}
}
}
}
std::string FileName = "Meson_Fields";
#ifdef HAVE_HDF5
using Default_Reader = Grid::Hdf5Reader;
@@ -134,12 +149,11 @@ int main(int argc, char *argv[])
using Default_Writer = Grid::BinaryWriter;
FileName.append(".bin");
#endif
Default_Writer w(FileName);
write(w,"phi_phi",Mpp);
write(w,"phi_rho",Mpr);
write(w,"rho_rho",Mrr);
{
Default_Writer w(FileName);
write(w,"phi_phi",Mpp);
write(w,"phi_phi_gpu",Mpp_gpu);
}
// epilogue
std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
Grid_finalize();