Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-03 21:44:33 +00:00)

Compare commits

17 Commits

882a217074 ... e652fc2825
| Author | SHA1 | Date |
|---|---|---|
|  | e652fc2825 |  |
|  | a49fa3f8d0 |  |
|  | cd452a2f91 |  |
|  | 4f89f603ae |  |
|  | 11dc2c5e1d |  |
|  | 6fec3c15ca |  |
|  | 938c47480f |  |
|  | 3811d19298 |  |
|  | 83a3ab6b6f |  |
|  | d66a9af6a3 |  |
|  | adc90d3a86 |  |
|  | ebbd015c5c |  |
|  | 4ab73b36b2 |  |
|  | 130e07a422 |  |
|  | 8f47bb367e |  |
|  | 0c3cb60135 |  |
|  | 9eae8fca5d |  |
@@ -277,6 +277,38 @@ public:
   assert(0);
 }
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  RealD shift;
public:
  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
    _Mat.Mdiag(in,out);
    out = out + shift*in;
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    _Mat.MdirAll(in,out);
  };
  void Op     (const Field &in, Field &out){
    _Mat.M(in,out);
    out = out + shift * in;
  }
  void AdjOp     (const Field &in, Field &out){
    _Mat.Mdag(in,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  void HermOp(const Field &in, Field &out){
    assert(0);
  }
};

//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
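The new ShiftedNonHermitianLinearOperator wraps any matrix object providing M/Mdag/Mdiag/Mdir/MdirAll and applies the shifted operator M + s (and, for real s, its adjoint Mdag + s). A minimal usage sketch follows; the action type (WilsonFermionD) and the field names are illustrative assumptions, not taken from this diff:

  // Sketch only: assumes a Grid fermion action Dw and fields src,res already constructed.
  RealD shift = 0.1;   // hypothetical shift value
  ShiftedNonHermitianLinearOperator<WilsonFermionD,LatticeFermionD> Mshift(Dw,shift);
  Mshift.Op(src,res);     // res = (M + shift) src
  Mshift.AdjOp(src,res);  // res = (Mdag + shift) src
  // HermOp and HermOpAndNorm are deliberately left unimplemented (assert(0)) for this operator.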
@@ -97,7 +97,7 @@ public:

    RealD scale;

    ConjugateGradient<FineField> CG(1.0e-2,100,false);
    ConjugateGradient<FineField> CG(1.0e-3,400,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);

@@ -110,7 +110,7 @@ public:
      
      hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

      for(int i=0;i<1;i++){
      for(int i=0;i<4;i++){

	CG(hermop,noise,subspace[b]);

@@ -146,7 +146,7 @@ public:
      
      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

      for(int i=0;i<3;i++){
      for(int i=0;i<2;i++){
	//  void operator() (const Field &src, Field &psi){
#if 1
	std::cout << GridLogMessage << " inverting on noise "<<std::endl;

@@ -441,8 +441,20 @@ public:
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
  }
#else
  //////////////////////////////////////////////////////////////////////
  // Galerkin projection of matrix
  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
    CoarsenOperator(linop,Subspace,Subspace);
  }
  //////////////////////////////////////////////////////////////////////
  // Petrov - Galerkin projection of matrix
  //////////////////////////////////////////////////////////////////////
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
		       Aggregation<Fobj,CComplex,nbasis> & U,
		       Aggregation<Fobj,CComplex,nbasis> & V)
  {
    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
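The change above splits coarsening into a Galerkin overload (left and right bases identical) and a Petrov-Galerkin overload (independent left basis U and right basis V), with the Galerkin form simply delegating to the general one. A hedged sketch of how the two overloads relate; CoarseOp, FineHermOp, Agg, AggLeft and AggRight are hypothetical names for a coarsened-matrix instance, the fine-grid linear operator and the aggregation objects:

  // Galerkin: a single aggregation basis, coarse operator ~ W^dag M W
  CoarseOp.CoarsenOperator(FineHermOp,Agg);             // forwards to CoarsenOperator(FineHermOp,Agg,Agg)

  // Petrov-Galerkin: distinct left/right bases, coarse operator ~ U^dag M V
  CoarseOp.CoarsenOperator(FineHermOp,AggLeft,AggRight);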
@@ -458,11 +470,9 @@ public:
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid()); 
    blockOrthogonalise(InnerProd,Subspace.subspace);
    blockOrthogonalise(InnerProd,V.subspace);
    blockOrthogonalise(InnerProd,U.subspace);

    //    for(int s=0;s<Subspace.subspace.size();s++){
      //      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
    //    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
	tphaseBZ-=usecond();
	phaV = phaF[p]*Subspace.subspace[i];
	phaV = phaF[p]*V.subspace[i];
	tphaseBZ+=usecond();

	/////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
	//	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

	tproj-=usecond();
	blockProject(coarseInner,MphaV,Subspace.subspace);
	blockProject(coarseInner,MphaV,U.subspace);
	coarseInner = conjugate(pha[p]) * coarseInner;

	ComputeProj[p] = coarseInner;
@@ -438,8 +438,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(rrq);
      off_node_bytes+=rbytes;
    }
#ifdef NVLINK_GET
    else { 
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
    }
#endif
  }
  
  // This is a NVLINK PUT  
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
@@ -448,9 +455,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
      list.push_back(xrq);
      off_node_bytes+=xbytes;
    } else {
#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
    }
  }
  return off_node_bytes;
@@ -459,7 +468,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
  int nreq=list.size();

  /*finishes Get/Put*/
  acceleratorCopySynchronise();

  if (nreq==0) return;
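In the communicator hunks above the intra-node fast path copies directly between GPUs instead of posting MPI: with NVLINK_GET defined the receiving rank pulls from the sender's shared-memory window, otherwise the sending rank pushes into the receiver's window, and StencilSendToRecvFromComplete synchronises the copy stream so either variant has finished before the MPI requests are waited on. A comment-only reading aid (not code from the commit):

  // #ifdef NVLINK_GET : receiver pulls: shm = ShmBufferTranslate(from,xmit); copy shm -> recv
  // #else             : sender pushes:  shm = ShmBufferTranslate(dest,recv); copy xmit -> shm
  // StencilSendToRecvFromComplete() then calls acceleratorCopySynchronise() /*finishes Get/Put*/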
@@ -137,7 +137,7 @@ public:
  ///////////////////////////////////////////////////
  static void SharedMemoryAllocate(uint64_t bytes, int flags);
  static void SharedMemoryFree(void);
  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
  static void SharedMemoryZero(void *dest,size_t bytes);

};
@@ -547,7 +547,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  HostCommBuf= acceleratorAllocHost(bytes);
#else 
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#if 0
  #warning "Moving host buffers to specific NUMA domain"
  int numa;
  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
  bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  acceleratorCopyToDevice(src,dest,bytes);
#else   
  bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
//  acceleratorCopyToDevice(src,dest,bytes);
//#else   
//  bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);

    ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
  }
  ShmBufferFreeAll();

@@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
#endif

  //SharedMemoryTest();
  SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
       GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
       acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
    }
  }
  ShmBarrier();
  for(uint64_t r=0;r<ShmSize;r++){
    ShmBarrier();
    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
    ShmBarrier();
    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
    assert(check[0]==GlobalSharedMemory::WorldNode);
    assert(check[1]==r);
    assert(check[2]==magic);
    ShmBarrier();
  }
  ShmBarrier();
  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}

void *SharedMemory::ShmBuffer(int rank)
@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
  
  //copy offsets to device
  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
  
  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
    exit(EXIT_FAILURE);
  }
  
  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
  accelerator_barrier();
@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  } else {							\
    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
  acceleratorSynchronise();						\
  acceleratorSynchronise();					\
  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
  Recon(result, Uchi);

@@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);

   if( interior && exterior ) {
     //     acceleratorFenceComputeStream();
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
#ifndef GRID_CUDA
@@ -446,6 +446,7 @@ public:
    Communicate();
    CommsMergeSHM(compress);
    CommsMerge(compress);
    accelerator_barrier();
  }

  template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -689,6 +690,7 @@ public:
	}
      }
    }
    //    std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
    surface_list.resize(surface_list_size);
    std::vector<int> surface_list_host(surface_list_size);
    int32_t ss=0;
@@ -708,7 +710,7 @@ public:
      }
    }
    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
    //    std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
  }
  /// Introduce a block structure and switch off comms on boundaries
  void DirichletBlock(const Coordinate &dirichlet_block)
@@ -800,8 +802,8 @@ public:
    this->_entries_host_p = &_entries[0];
    this->_entries_p = &_entries_device[0];

    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
    //    std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
    //	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
    
    for(int ii=0;ii<npoints;ii++){
@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
  return ptr;
};

typedef int acceleratorEvent_t;

inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
  return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
  return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
  //auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


inline int  acceleratorIsCommunicable(void *ptr)
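Across the CUDA, HIP, SYCL and host backends the asynchronous copy helpers now return an acceleratorEvent_t and pair with acceleratorEventWait / acceleratorEventIsComplete (or the stream-wide acceleratorCopySynchronise). A hedged sketch of the intended calling pattern, based only on the signatures shown in this diff; the buffer names and byte count are illustrative:

  // Sketch: overlap a device-to-device copy with other work, then wait on the event.
  acceleratorEvent_t ev = acceleratorCopyDeviceToDeviceAsynch(srcBuf,dstBuf,bytes);
  // ... launch independent kernels or post MPI here ...
  if ( ! acceleratorEventIsComplete(ev) ) acceleratorEventWait(ev);
  // Alternatively, acceleratorCopySynchronise() drains the whole copy stream.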
@@ -359,9 +373,9 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
  return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}

inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); }

inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMallocHost((void **)&ptr,bytes);
  auto err = hipHostMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a

inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;

inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
  auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
  return 0;
}
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  acceleratorCopyToDevice(from,to,bytes);
  return 0;
}
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

inline void acceleratorEventWait(acceleratorEvent_t ev)
{
  //  auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}


#endif

inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)

#undef GRID_SIMT

typedef int acceleratorEvent_t;

inline void acceleratorMem(void)
{
  /*
@@ -583,9 +611,12 @@ inline void acceleratorMem(void)

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); return 0;}

inline void acceleratorCopySynchronise(void) {};

inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -668,7 +699,7 @@ accelerator_inline void acceleratorFence(void)
  return;
}

inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
{
  acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
  acceleratorCopySynchronise();
systems/Frontier-rocm631/config-command (new file, 22 lines)
@@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
 LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
systems/Frontier-rocm631/sourceme631.sh (new file, 16 lines)
@@ -0,0 +1,16 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh

#module load cce/15.0.1

module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH

#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
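A plausible way to use these two new Frontier ROCm 6.3.1 files, inferred from the relative "../../configure" path rather than stated anywhere in this diff, is to source the environment script and then run the configure wrapper from inside the systems/Frontier-rocm631 directory:

  # Hedged sketch: assumes the scripts are invoked from within systems/Frontier-rocm631
  cd systems/Frontier-rocm631
  source sourceme631.sh       # loads rocm/6.3.1, cray-fftw, craype-accel-amd-gfx90a and the spack env
  bash config-command         # runs ../../configure with the ROCm/HIP Frontier options above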
@@ -30,14 +30,10 @@ source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol

srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done
@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
 LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
 LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
@@ -1,12 +1,25 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs 
module load PrgEnv-gnu
module load rocm/6.0.0
module load cray-mpich
module load gmp

module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a

#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

#echo spack load c-lime
#spack load c-lime
#module load emacs 
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
@@ -47,20 +47,20 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    //    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    //    _Mat.M(in,tmp);
    //    _PV.Mdag(tmp,out);
@@ -83,14 +83,14 @@ public:
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    //    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(tmp,out);
    _Mat.Mdag(in,tmp);
@@ -98,7 +98,7 @@ public:
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
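In these last hunks only the verbose std::cout tracing lines in Op, AdjOp and HermOp of the PVdag M operators change (each plain and commented pair is the before and after of one line); the operator algebra itself is untouched, with HermOp still composing the two one-sided applications. A comment-only reading aid, not code added by the commit:

  // HermOp(in,out) is implemented as AdjOp(Op(in)):
  Field tmp(in.Grid());
  Op(in,tmp);      // forward application: PVdag M (plus the shift in the shifted variant)
  AdjOp(tmp,out);  // adjoint application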