Systematise the accelerator primitives and locate them in Grid/threads/Accelerator.h / Accelerator.cc

Aim to reduce the amount of CUDA and other code variations floating around all over the place. Will move GpuInit into Accelerator.cc from Init.cc. Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows.
2025-07-13 03:27:07 +01:00 · 2020-05-08 06:23:55 -07:00
parent 28a1fcaaff
commit f8b8e00090
13 changed files with 557 additions and 718 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@ -1,14 +1,3 @@
-    // blockZaxpy in bockPromote - 3s, 5%
-    // noncoalesced linalg in Preconditionoer ~ 3s 5%
-    // Lancos tuning or replace 10-20s ~ 25%, open ended
-    // setup tuning   5s  ~  8%
-    //    -- e.g. ordermin, orderstep tunables.
-    // MdagM path without norm in LinOp code.     few seconds
-
-    // Mdir calc blocking kernels
-    // Fuse kernels in blockMaskedInnerProduct
-    // preallocate Vectors in Cayley 5D ~ few percent few seconds
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@ -91,34 +80,7 @@ public:
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
-      
-    //// report back
-    std::cout<<GridLogMessage<<"directions    :";
-    for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
-    std::cout<<std::endl;
-    std::cout<<GridLogMessage<<"displacements :";
-    for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
-    std::cout<<std::endl;
  }
-  
-  /*
-  // Original cleaner code
-  Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
-  for(int d=0;d<dimension;d++){
-  directions[2*d  ] = d;
-  directions[2*d+1] = d;
-  displacements[2*d  ] = +1;
-  displacements[2*d+1] = -1;
-  }
-  directions   [2*dimension]=0;
-  displacements[2*dimension]=0;
-  }
-  std::vector<int> GetDelta(int point) {
-  std::vector<int> delta(dimension,0);
-  delta[directions[point]] = displacements[point];
-  return delta;
-  };
-  */    

 };
  
@ -149,25 +111,7 @@ public:
    CoarseScalar InnerProd(CoarseGrid); 
    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
-    //    std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
-    //    blockOrthogonalise(InnerProd,subspace);
-    //      std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
-    //      CheckOrthogonal();
  } 
-  void CheckOrthogonal(void){
-    CoarseVector iProj(CoarseGrid); 
-    CoarseVector eProj(CoarseGrid); 
-    for(int i=0;i<nbasis;i++){
-      blockProject(iProj,subspace[i],subspace);
-      eProj=Zero(); 
-      accelerator_for(ss, CoarseGrid->oSites(),1,{
-	eProj[ss](i)=CComplex(1.0);
-      });
-      eProj=eProj - iProj;
-      std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
-    }
-    std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
-  }
  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
    blockProject(CoarseVec,FineVec,subspace);
  }
@ -175,50 +119,12 @@ public:
    FineVec.Checkerboard() = subspace[0].Checkerboard();
    blockPromote(CoarseVec,FineVec,subspace);
  }
-  void CreateSubspaceRandom(GridParallelRNG &RNG){
-    for(int i=0;i<nbasis;i++){
-      random(RNG,subspace[i]);
-    }
-  }
-
-  virtual void CreateSubspace(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
-
-    RealD scale;
-
-    ConjugateGradient<FineField> CG(1.0e-2,100,false);
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-
-    for(int b=0;b<nn;b++){
-	
-      subspace[b] = Zero();
-      gaussian(RNG,noise);
-      scale = std::pow(norm2(noise),-0.5); 
-      noise=noise*scale;
-	
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-      for(int i=0;i<1;i++){
-
-	CG(hermop,noise,subspace[b]);
-
-	noise = subspace[b];
-	scale = std::pow(norm2(noise),-0.5); 
-	noise=noise*scale;
-
-      }
-
-      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
-      subspace[b]   = noise;
-
-    }
-  }

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
  // and this is the best I found
  ////////////////////////////////////////////////////////////////////////////////////////////////
-#if 1
+
  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
 				       int nn,
 				       double hi,
@ -313,201 +219,6 @@ public:
    }
    assert(b==nn);
  }
-#endif
-#if 0
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-    FineField combined(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-#define FILTERb(llo,hhi,oorder)						\
-    {									\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
-      Cheb(hermop,noise,Mn);						\
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
-      subspace[b]   = Mn;						\
-      hermop.Op(Mn,tmp);						\
-      std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-
-    //      JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5);	\
-
-    RealD alpha=-0.8;
-    RealD beta =-0.8;
-#define FILTER(llo,hhi,oorder)						\
-    {									\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);			\
-      /* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
-      Cheb(hermop,noise,Mn);						\
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;			\
-      subspace[b]   = Mn;						\
-      hermop.Op(Mn,tmp);						\
-      std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-    
-#define FILTERc(llo,hhi,oorder)				\
-    {							\
-      Chebyshev<FineField> Cheb(llo,hhi,oorder);	\
-      Cheb(hermop,noise,combined);			\
-    }									
-
-    double node = 0.000;
-    FILTERb(lo,hi,orderfilter);// 0
-    //    FILTERc(node,hi,51);// 0
-    noise = Mn;
-    int base = 0;
-    int mult = 100;
-    FILTER(node,hi,base+1*mult);
-    FILTER(node,hi,base+2*mult);
-    FILTER(node,hi,base+3*mult);
-    FILTER(node,hi,base+4*mult);
-    FILTER(node,hi,base+5*mult);
-    FILTER(node,hi,base+6*mult);
-    FILTER(node,hi,base+7*mult);
-    FILTER(node,hi,base+8*mult);
-    FILTER(node,hi,base+9*mult);
-    FILTER(node,hi,base+10*mult);
-    FILTER(node,hi,base+11*mult);
-    FILTER(node,hi,base+12*mult);
-    FILTER(node,hi,base+13*mult);
-    FILTER(node,hi,base+14*mult);
-    FILTER(node,hi,base+15*mult);
-    assert(b==nn);
-  }
-#endif
-
-#if 0
-  virtual void CreateSubspaceChebyshev(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
-				       int nn,
-				       double hi,
-				       double lo,
-				       int orderfilter,
-				       int ordermin,
-				       int orderstep,
-				       double filterlo
-				       ) {
-
-    RealD scale;
-
-    FineField noise(FineGrid);
-    FineField Mn(FineGrid);
-    FineField tmp(FineGrid);
-    FineField combined(FineGrid);
-
-    // New normalised noise
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5); 
-    noise=noise*scale;
-
-    // Initial matrix element
-    hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-    int b =0;
-    {						
-      Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
-      //      JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
-      //JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
-      //      JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
-      JacobiPoly(hermop,noise,Mn);
-      scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
-      subspace[b]   = Mn;
-      hermop.Op(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; 
-      b++;
-      //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
-      //      subspace[b]   = tmp;      b++;
-      //    }									
-    }									
-
-#define FILTER(lambda)						\
-    {								\
-      hermop.HermOp(subspace[0],tmp);				\
-      tmp = tmp - lambda *subspace[0];				\
-      scale = std::pow(norm2(tmp),-0.5);			\
-      tmp=tmp*scale;							\
-      subspace[b]   = tmp;						\
-      hermop.Op(subspace[b],tmp);					\
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
-      b++;								\
-    }									
-    //      scale = std::pow(norm2(tmp),-0.5);     tmp=tmp*scale;
-    //      subspace[b]   = tmp;      b++;
-    //    }									
-
-    FILTER(2.0e-5);
-    FILTER(2.0e-4);
-    FILTER(4.0e-4);
-    FILTER(8.0e-4);
-    FILTER(8.0e-4);
-
-    FILTER(2.0e-3);
-    FILTER(3.0e-3);
-    FILTER(4.0e-3);
-    FILTER(5.0e-3);
-    FILTER(6.0e-3);
-
-    FILTER(2.5e-3);
-    FILTER(3.5e-3);
-    FILTER(4.5e-3);
-    FILTER(5.5e-3);
-    FILTER(6.5e-3);
-
-    //    FILTER(6.0e-5);//6
-    //    FILTER(7.0e-5);//8
-    //    FILTER(8.0e-5);//9
-    //    FILTER(9.0e-5);//3
-
-    /*
-    //    FILTER(1.0e-4);//10
-    FILTER(2.0e-4);//11
-    //   FILTER(3.0e-4);//12
-    //    FILTER(4.0e-4);//13
-    FILTER(5.0e-4);//14
-
-    FILTER(6.0e-3);//4
-    FILTER(7.0e-4);//1
-    FILTER(8.0e-4);//7
-    FILTER(9.0e-4);//15
-    FILTER(1.0e-3);//2
-
-    FILTER(2.0e-3);//2
-    FILTER(3.0e-3);//2
-    FILTER(4.0e-3);//2
-    FILTER(5.0e-3);//2
-    FILTER(6.0e-3);//2
-
-    FILTER(7.0e-3);//2
-    FILTER(8.0e-3);//2
-    FILTER(1.0e-2);//2
-    */
-    std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
-    assert(b==nn);
-  }
-#endif
-

 };

@ -580,23 +291,22 @@ public:
      int ptype;
      StencilEntry *SE;

-      int lane=SIMTlane(Nsimd);
      for(int point=0;point<geom.npoint;point++){

 	SE=Stencil.GetEntry(ptype,point,ss);
 	  
 	if(SE->_is_local) { 
-	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
 	} else {
-	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
 	}
-	synchronise();
+	acceleratorSynchronise();

 	for(int bb=0;bb<nbasis;bb++) {
 	  res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
 	}
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
    });
    usecs +=usecond();

@ -604,13 +314,6 @@ public:
    RealD Nout= norm2(out);
    nrm_usec+=usecond();

-    /*
-        std::cout << GridLogMessage << "\tNorm        " << nrm_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tHalo        " << comms_usec << " us" <<std::endl;
-        std::cout << GridLogMessage << "\tMatrix      " << usecs << " us" <<std::endl;
-        std::cout << GridLogMessage << "\t  mflop/s   " << flops/usecs<<std::endl;
-        std::cout << GridLogMessage << "\t  MB/s      " << bytes/usecs<<std::endl;
-    */
    return Nout;
  };

@ -658,45 +361,20 @@ public:
      int ptype;
      StencilEntry *SE;

-      int lane=SIMTlane(Nsimd);
      SE=Stencil.GetEntry(ptype,point,ss);
 	  
      if(SE->_is_local) { 
-	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
+	nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
      } else {
-	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
+	nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
      }
-      synchronise();
+      acceleratorSynchronise();

      for(int bb=0;bb<nbasis;bb++) {
 	res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
      }
-      coalescedWrite(out_v[ss](b),res,lane);
+      coalescedWrite(out_v[ss](b),res);
    });
-#if 0
-    accelerator_for(ss,Grid()->oSites(),1,{
-
-      siteVector res = Zero();
-      siteVector nbr;
-      int ptype;
-      StencilEntry *SE;
-      
-      SE=Stencil.GetEntry(ptype,point,ss);
-      
-      if(SE->_is_local&&SE->_permute) {
-	permute(nbr,in_v[SE->_offset],ptype);
-      } else if(SE->_is_local) {
-	nbr = in_v[SE->_offset];
-      } else {
-	nbr = Stencil.CommBuf()[SE->_offset];
-      }
-      synchronise();
-
-      res = res + Aview_p[point][ss]*nbr;
-      
-      out_v[ss]=res;
-    });
-#endif
  }
  void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
  {
@ -912,33 +590,8 @@ public:
      std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
      ForceHermitian();
    }
-      // AssertHermitian();
-      // ForceDiagonal();
  }

-#if 0
-    ///////////////////////////
-    // test code worth preserving in if block
-    ///////////////////////////
-    std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
-    for(int p=0;p<geom.npoint;p++){
-      std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
-      std::cout<<GridLogMessage<< A[p] << std::endl;
-    }
-    std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
-
-    phi=Subspace.subspace[0];
-    std::vector<int> bc(FineGrid->_ndimension,0);
-
-    blockPick(Grid(),phi,tmp,bc);      // Pick out a block
-    linop.Op(tmp,Mphi);                // Apply big dop
-    blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
-    std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
-    std::cout<<GridLogMessage<< iProj <<std::endl;
-    std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
-#endif
-
-
  void ForceHermitian(void) {
    CoarseMatrix Diff  (Grid());
    for(int p=0;p<geom.npoint;p++){
@ -958,27 +611,6 @@ public:
      }
    }
  }
-  void AssertHermitian(void) {
-    CoarseMatrix AA    (Grid());
-    CoarseMatrix AAc   (Grid());
-    CoarseMatrix Diff  (Grid());
-    for(int d=0;d<4;d++){
-	
-      int dd=d+1;
-      AAc = Cshift(A[2*d+1],dd,1);
-      AA  = A[2*d];
-	
-      Diff = AA - adj(AAc);
-
-      std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
-      std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
-	  
-    }
-    Diff = A[8] - adj(A[8]);
-    std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
-    std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
-  }
-    
 };

 NAMESPACE_END(Grid);