Merge pull request #457 from lehner/feature/gpt

Import GPT-related updates
2025-12-12 16:54:41 +00:00 · 2024-02-28 13:59:04 -05:00
parent 73c0b29535 9f89486df5
commit 2e570f5300
7 changed files with 50 additions and 25 deletions
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-#if ( (!defined(GRID_CUDA)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

-  vobj zz = Zero();
-  
  accelerator_for(sc,coarse->oSites(),1,{

      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      vobj cd = zz;
+      vobj cd = Zero();
      
      for(int sb=0;sb<blockVol;sb++){

--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -45,6 +45,7 @@ public:
  };
  // Host only
  GridBase * getGrid(void) const { return _grid; };
+  vobj* getHostPointer(void) const { return _odata; }; // hands back the stored _odata pointer unchanged (no copy, no null check)
 };

 /////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -63,6 +63,8 @@ public:
  virtual void MooeeDag(const FermionField &in, FermionField &out) ;
  virtual void MooeeInv(const FermionField &in, FermionField &out) ;
  virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
+  virtual void M(const FermionField &in, FermionField &out) ; // body in implementation/WilsonTMFermionImplementation.h: Dhop with DaggerNo, diagonal term uses +mu
+  virtual void Mdag(const FermionField &in, FermionField &out) ; // body in implementation/WilsonTMFermionImplementation.h: Dhop with DaggerYes, diagonal term uses -mu
  
 private:
  RealD mu; // TwistedMass parameter
--- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h
@@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
  RealD b    = tm /sq;
  axpibg5x(out,in,a,b);
 }
+template<class Impl>
+void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) { // out = Dhop(in, DaggerNo) + tmp, where tmp is built from in, (4+mass) and mu below
+  out.Checkerboard() = in.Checkerboard(); // out takes over in's checkerboard value
+  this->Dhop(in, out, DaggerNo); // apply Dhop to in with DaggerNo; the result is expected in out (it is read back by axpy below)
+  FermionField tmp(out.Grid()); // scratch field constructed on out's grid
+  RealD a = 4.0+this->mass; // first scalar handed to axpibg5x
+  RealD b = this->mu; // second scalar handed to axpibg5x: the twisted-mass parameter mu
+  axpibg5x(tmp,in,a,b); // presumably tmp = a*in + i*b*gamma5*in, going by the helper's name -- TODO confirm
+  axpy(out, 1.0, tmp, out); // presumably out = 1.0*tmp + out -- TODO confirm axpy argument order
+}
+template<class Impl>
+void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { // out = Dhop(in, DaggerYes) + tmp, where tmp is built from in, (4+mass) and -mu below
+  out.Checkerboard() = in.Checkerboard(); // out takes over in's checkerboard value
+  this->Dhop(in, out, DaggerYes); // apply Dhop to in with DaggerYes; the result is expected in out (it is read back by axpy below)
+  FermionField tmp(out.Grid()); // scratch field constructed on out's grid
+  RealD a = 4.0+this->mass; // first scalar handed to axpibg5x
+  RealD b = -this->mu; // second scalar handed to axpibg5x: mu with its sign flipped
+  axpibg5x(tmp,in,a,b); // presumably tmp = a*in + i*b*gamma5*in, going by the helper's name -- TODO confirm
+  axpy(out, 1.0, tmp, out); // presumably out = 1.0*tmp + out -- TODO confirm axpy argument order
+}

 NAMESPACE_END(Grid);
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -706,7 +706,7 @@ public:
 	}
      }
    }
-    std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
+    //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,7 +761,8 @@ public:
 		   int checkerboard,
 		   const std::vector<int> &directions,
 		   const std::vector<int> &distances,
-		   Parameters p=Parameters())
+		   Parameters p=Parameters(),
+		   bool preserve_shm=false)
  {
    face_table_computed=0;
    _grid    = grid;
@@ -855,6 +856,8 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();

+    // Allow for multiple stencils to exist simultaneously
+    if (!preserve_shm)
      _grid->ShmBufferFreeAll();

    int maxl=2;
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -288,11 +288,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
    unsigned long nt=acceleratorThreads();				\
+    if(nt < 8)nt=8;							\
    unsigned long unum1 = num1;						\
    unsigned long unum2 = num2;						\
-      if(nt < 8)nt=8;							\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
    cl::sycl::range<3> local {nt,1,nsimd};				\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
    cgh.parallel_for(							\
 		     cl::sycl::nd_range<3>(global,local),		\
 		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
@@ -301,7 +302,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
 		       auto lane     = item.get_global_id(2);		\
-      { __VA_ARGS__ };				      \
+		       { if (iter1 < unum1){ __VA_ARGS__ } };		\
 		     });						\
  });