mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
commit
80fd6ab407
@ -34,6 +34,12 @@
|
||||
#define __SYCL__REDEFINE__
|
||||
#endif
|
||||
|
||||
/* HIP save and restore compile environment*/
|
||||
#ifdef GRID_HIP
|
||||
#pragma push
|
||||
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
||||
#endif
|
||||
#define EIGEN_NO_HIP
|
||||
|
||||
#include <Grid/Eigen/Dense>
|
||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||
@ -52,6 +58,12 @@
|
||||
#pragma pop
|
||||
#endif
|
||||
|
||||
/*HIP restore*/
|
||||
#ifdef __HIP__REDEFINE__
|
||||
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
||||
#pragma pop
|
||||
#endif
|
||||
|
||||
#if defined __GNUC__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
@ -138,21 +138,6 @@ public:
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int xmit_to_rank,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||
|
||||
double StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
|
@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int xmit_to_rank,
|
||||
int recv_from_rank,
|
||||
int bytes)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
// Basic Halo comms primitive -- should never call in single node
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
bcopy(in,out,bytes*words);
|
||||
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
SendToRecvFromComplete(list);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void){};
|
||||
|
@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifdef GRID_CUDA
|
||||
#include <cuda_runtime_api.h>
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define header "SharedMemoryMpi: "
|
||||
@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_CUDA
|
||||
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
void * ShmCommBuf ;
|
||||
@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
||||
#else
|
||||
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
|
||||
#endif
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
|
||||
@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
//////////////////////////////////////////////////
|
||||
// If it is me, pass around the IPC access key
|
||||
//////////////////////////////////////////////////
|
||||
#ifdef GRID_CUDA
|
||||
cudaIpcMemHandle_t handle;
|
||||
|
||||
if ( r==WorldShmRank ) {
|
||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
hipIpcMemHandle_t handle;
|
||||
if ( r==WorldShmRank ) {
|
||||
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
||||
if ( err != hipSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
//////////////////////////////////////////////////
|
||||
// Share this IPC handle across the Shm Comm
|
||||
//////////////////////////////////////////////////
|
||||
@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// If I am not the source, overwrite thisBuf with remote buffer
|
||||
///////////////////////////////////////////////////////////////
|
||||
void * thisBuf = ShmCommBuf;
|
||||
#ifdef GRID_CUDA
|
||||
if ( r!=WorldShmRank ) {
|
||||
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||
if ( err != cudaSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
if ( r!=WorldShmRank ) {
|
||||
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
||||
if ( err != hipSuccess) {
|
||||
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Save a copy of the device buffers
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
autoView( rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto lhs_t=lhs_v(ss);
|
||||
auto rhs_t=rhs_v(ss);
|
||||
auto tmp =ret_v(ss);
|
||||
mac(&tmp,&lhs_t,&rhs_t);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
});
|
||||
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( lhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto tmp =ret_v(ss);
|
||||
auto lhs_t=lhs_v(ss);
|
||||
mac(&tmp,&lhs_t,&rhs);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||
autoView( ret_v , ret, AcceleratorWrite);
|
||||
autoView( rhs_v , lhs, AcceleratorRead);
|
||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||
decltype(coalescedRead(obj1())) tmp;
|
||||
auto tmp =ret_v(ss);
|
||||
auto rhs_t=rhs_v(ss);
|
||||
mac(&tmp,&lhs,&rhs_t);
|
||||
coalescedWrite(ret_v[ss],tmp);
|
||||
|
@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#ifdef GRID_HIP
|
||||
extern hipDeviceProp_t *gpu_props;
|
||||
#define WARP_SIZE 64
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
extern cudaDeviceProp *gpu_props;
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
#define WARP_SIZE 32
|
||||
__device__ unsigned int retirementCount = 0;
|
||||
|
||||
template <class Iterator>
|
||||
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
|
||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||
__syncwarp();
|
||||
acceleratorSynchronise();
|
||||
|
||||
const Iterator VEC = WARP_SIZE;
|
||||
const Iterator vid = tid & (VEC-1);
|
||||
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
beta += temp;
|
||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||
}
|
||||
__syncwarp();
|
||||
acceleratorSynchronise();
|
||||
}
|
||||
__syncthreads();
|
||||
acceleratorSynchroniseAll();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
beta = Zero();
|
||||
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
||||
}
|
||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||
}
|
||||
__syncthreads();
|
||||
acceleratorSynchroniseAll();
|
||||
}
|
||||
|
||||
|
||||
|
@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic Nc kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
template<int Naik> accelerator_inline
|
||||
template<int Naik>
|
||||
static accelerator_inline
|
||||
void DhopSiteGeneric(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
template<int Naik> accelerator_inline
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteGenericInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
template<int Naik> accelerator_inline
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteGenericExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
template<int Naik> accelerator_inline
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHand(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
template<int Naik> accelerator_inline
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHandInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
template<int Naik> accelerator_inline
|
||||
|
||||
template<int Naik> static accelerator_inline
|
||||
void DhopSiteHandExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Asm Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void DhopSiteAsm(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
|
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
|
||||
PropagatorField tmp(UGrid);
|
||||
PropagatorField Utmp(UGrid);
|
||||
LatticeInteger zz (UGrid); zz=0.0;
|
||||
PropagatorField zz (UGrid); zz=0.0;
|
||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||
for (int s=0;s<Ls;s++) {
|
||||
|
||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField tmp(UGrid);
|
||||
PropagatorField Utmp(UGrid);
|
||||
|
||||
LatticeInteger zz (UGrid); zz=0.0;
|
||||
PropagatorField zz (UGrid); zz=0.0;
|
||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
|
@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||
|
||||
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||
|
||||
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
|
@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||
// Int, Ext, Int+Ext cases for comms overlap
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||
// Only contributions from interior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||
// Only contributions from exterior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <int Naik>
|
||||
template <int Naik> accelerator_inline
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int sF, int sU,
|
||||
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_RESULT_EXT(ss,F)
|
||||
|
||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
nmu = 0; \
|
||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
template<> void \
|
||||
template<> accelerator_inline void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
|
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
||||
HAND_RESULT(ss);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
template<class Impl> accelerator_inline void
|
||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
||||
HAND_RESULT_EXT(ss);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
template<class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
|
@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// All legs kernels ; comms then compute
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
||||
coalescedWrite(out[sF],result,lane);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Interior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
||||
coalescedWrite(out[sF], result,lane);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Exterior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
||||
};
|
||||
|
||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||
template <class Impl> \
|
||||
template <class Impl> accelerator_inline \
|
||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||
{ \
|
||||
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||
|
||||
template <class Impl>
|
||||
template <class Impl> accelerator_inline
|
||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||
{
|
||||
|
@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||
typedef struct { uint16_t x;} half;
|
||||
#endif
|
||||
typedef struct Half2_t { half x; half y; } Half2;
|
||||
|
||||
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
|
||||
|
||||
template<class pair>
|
||||
@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
|
||||
}
|
||||
|
||||
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
|
||||
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
|
||||
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
|
||||
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
|
||||
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
|
||||
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
|
||||
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
|
||||
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
|
||||
|
||||
typedef GpuComplex<half2 > GpuComplexH;
|
||||
typedef GpuComplex<Half2 > GpuComplexH;
|
||||
typedef GpuComplex<float2 > GpuComplexF;
|
||||
typedef GpuComplex<double2> GpuComplexD;
|
||||
|
||||
@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
|
||||
accelerator_inline float half2float(half h)
|
||||
{
|
||||
float f;
|
||||
#ifdef GRID_SIMT
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
f = __half2float(h);
|
||||
#else
|
||||
//f = __half2float(h);
|
||||
__half_raw hr(h);
|
||||
Grid_half hh;
|
||||
hh.x = hr.x;
|
||||
f= sfw_half_to_float(hh);
|
||||
@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
|
||||
accelerator_inline half float2half(float f)
|
||||
{
|
||||
half h;
|
||||
#ifdef GRID_SIMT
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
h = __float2half(f);
|
||||
#else
|
||||
Grid_half hh = sfw_float_to_half(f);
|
||||
__half_raw hr;
|
||||
hr.x = hh.x;
|
||||
h = __half(hr);
|
||||
h.x = hh.x;
|
||||
#endif
|
||||
return h;
|
||||
}
|
||||
@ -523,7 +524,7 @@ namespace Optimization {
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Single / Half
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
|
||||
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
|
||||
int N = GpuVectorCF::N;
|
||||
GpuVectorCH h;
|
||||
for(int i=0;i<N;i++) {
|
||||
|
@ -59,6 +59,7 @@ void acceleratorInit(void)
|
||||
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
|
||||
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||
|
||||
|
||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
||||
GPU_PROP(managedMemory);
|
||||
GPU_PROP(isMultiGpuBoard);
|
||||
@ -113,20 +114,24 @@ void acceleratorInit(void)
|
||||
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
||||
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
||||
|
||||
printf("world_rank %d has %d devices\n",world_rank,nDevices);
|
||||
size_t totalDeviceMem=0;
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
|
||||
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
||||
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
||||
|
||||
hipGetDeviceProperties(&gpu_props[i], i);
|
||||
hipDeviceProp_t prop;
|
||||
prop = gpu_props[i];
|
||||
totalDeviceMem = prop.totalGlobalMem;
|
||||
if ( world_rank == 0) {
|
||||
hipDeviceProp_t prop;
|
||||
prop = gpu_props[i];
|
||||
printf("AcceleratorHipInit: ========================\n");
|
||||
printf("AcceleratorHipInit: Device Number : %d\n", i);
|
||||
printf("AcceleratorHipInit: ========================\n");
|
||||
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
|
||||
|
||||
GPU_PROP_FMT(totalGlobalMem,"%lu");
|
||||
// GPU_PROP(managedMemory);
|
||||
GPU_PROP(isMultiGpuBoard);
|
||||
GPU_PROP(warpSize);
|
||||
@ -135,6 +140,7 @@ void acceleratorInit(void)
|
||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||
}
|
||||
}
|
||||
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
||||
#undef GPU_PROP_FMT
|
||||
#undef GPU_PROP
|
||||
#ifdef GRID_IBM_SUMMIT
|
||||
|
@ -307,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
|
||||
inline void *acceleratorAllocShared(size_t bytes)
|
||||
{
|
||||
#if 0
|
||||
void *ptr=NULL;
|
||||
auto err = hipMallocManaged((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
|
||||
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
return malloc(bytes);
|
||||
#endif
|
||||
};
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
|
||||
@ -327,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
auto err = hipMalloc((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
|
||||
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
|
@ -36,12 +36,12 @@ int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
#define LMAX (48)
|
||||
#define LMAX (40)
|
||||
#define LMIN (8)
|
||||
#define LADD (8)
|
||||
|
||||
int64_t Nwarm=50;
|
||||
int64_t Nloop=500;
|
||||
int64_t Nwarm=10;
|
||||
int64_t Nloop=100;
|
||||
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
@ -118,6 +118,41 @@ int main (int argc, char ** argv)
|
||||
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z=z+ x*y"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||
|
||||
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||
|
||||
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||
|
||||
for(int64_t i=0;i<Nwarm;i++){
|
||||
z=z+x*y;
|
||||
}
|
||||
double start=usecond();
|
||||
for(int64_t i=0;i<Nloop;i++){
|
||||
z=z+x*y;
|
||||
}
|
||||
double stop=usecond();
|
||||
double time = (stop-start)/Nloop*1000.0;
|
||||
|
||||
double bytes=4*vol*Nc*Nc*sizeof(Complex);
|
||||
double flops=Nc*Nc*(6+8+8)*vol;
|
||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
@ -143,7 +178,6 @@ int main (int argc, char ** argv)
|
||||
double start=usecond();
|
||||
for(int64_t i=0;i<Nloop;i++){
|
||||
mult(z,x,y);
|
||||
// mac(z,x,y);
|
||||
}
|
||||
double stop=usecond();
|
||||
double time = (stop-start)/Nloop*1000.0;
|
||||
|
@ -187,7 +187,8 @@ int main (int argc, char ** argv)
|
||||
auto xx = coalescedRead(x_v[ss]);
|
||||
auto yy = coalescedRead(y_v[ss]);
|
||||
auto zz = coalescedRead(z_v[ss]);
|
||||
zz = zz+xx*yy;
|
||||
//zz = zz+xx*yy;
|
||||
mac(&zz,&xx,&yy);
|
||||
coalescedWrite(z_v[ss],zz);
|
||||
});
|
||||
}
|
||||
|
@ -330,12 +330,18 @@ case ${CXXTEST} in
|
||||
fi
|
||||
;;
|
||||
hipcc)
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
|
||||
CXXLD=${CXX}
|
||||
if test $ac_openmp = yes; then
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
|
||||
fi
|
||||
;;
|
||||
dpcpp)
|
||||
LDFLAGS="$LDFLAGS"
|
||||
CXXFLAGS="$CXXFLAGS"
|
||||
CXXLD=${CXX}
|
||||
;;
|
||||
*)
|
||||
CXXLD=${CXX}
|
||||
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
|
||||
|
@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
|
||||
|
||||
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
|
||||
|
||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
|
||||
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
|
||||
|
||||
#### 2. `Release`
|
||||
|
||||
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
|
||||
|
||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
|
||||
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
|
||||
|
||||
#### 3. `MPIDebug`
|
||||
|
||||
Debug configuration with MPI:
|
||||
|
||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
|
||||
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
|
||||
|
||||
### 5.3 Build Grid
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user