mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-22 17:52:02 +01:00
Compare commits
8 Commits
a93d5459d4
...
95b640cb6b
Author | SHA1 | Date | |
---|---|---|---|
95b640cb6b | |||
2cb5bedc15 | |||
806b02bddf | |||
de40395773 | |||
7ba4788715 | |||
06d9ce1a02 | |||
75bb6b2b40 | |||
74f10c2dc0 |
@ -406,6 +406,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
{
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
|
StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
int nreq=list.size();
|
int nreq=list.size();
|
||||||
|
|
||||||
|
@ -400,7 +400,6 @@ public:
|
|||||||
}
|
}
|
||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
accelerator_barrier();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -233,10 +233,10 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
GaugeField HUmu(_Umu.Grid());
|
GaugeField HUmu(_Umu.Grid());
|
||||||
HUmu = _Umu*(-0.5);
|
HUmu = _Umu*(-0.5);
|
||||||
if ( Dirichlet ) {
|
if ( Dirichlet ) {
|
||||||
std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
|
std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
|
||||||
Coordinate GaugeBlock(Nd);
|
Coordinate GaugeBlock(Nd);
|
||||||
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
|
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
|
||||||
std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
|
std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
|
||||||
DirichletFilter<GaugeField> Filter(GaugeBlock);
|
DirichletFilter<GaugeField> Filter(GaugeBlock);
|
||||||
Filter.applyFilter(HUmu);
|
Filter.applyFilter(HUmu);
|
||||||
}
|
}
|
||||||
@ -382,12 +382,14 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
DhopTotalTime-=usecond();
|
// std::cout << GridLogDslash<<"Dhop internal"<<std::endl;
|
||||||
|
DhopTotalTime=-usecond();
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||||
DhopTotalTime+=usecond();
|
DhopTotalTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<<"Dhop took"<<DhopTotalTime<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -404,53 +406,59 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Start comms // Gather intranode and extra node differentiated??
|
// Start comms // Gather intranode and extra node differentiated??
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime=-usecond();
|
||||||
st.HaloExchangeOptGather(in,compressor);
|
st.HaloExchangeOptGather(in,compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " <<std::endl;
|
||||||
|
|
||||||
DhopCommTime -=usecond();
|
DhopCommTime =-usecond();
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime=-usecond();
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Commsmerge end "<<DhopFaceTime<< " us "<<std::endl;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||||
DhopComputeTime-=usecond();
|
DhopComputeTime=-usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
DhopComputeTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Compute 1 end "<< DhopComputeTime<<" us" <<std::endl;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
DhopCommTime +=usecond();
|
DhopCommTime +=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Comunicate end "<< DhopCommTime << " us" <<std::endl;
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime=-usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop CommsMerge2 end "<<DhopFaceTime << " us "<<std::endl;
|
||||||
|
|
||||||
DhopComputeTime2-=usecond();
|
DhopComputeTime2=-usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
DhopComputeTime2+=usecond();
|
DhopComputeTime2+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Ext end "<<DhopComputeTime2 <<"us "<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -464,11 +472,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
|||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
DhopCommTime-=usecond();
|
// std::cout << GridLogDslash<< " Dhop Halo exchange begine " <<std::endl;
|
||||||
|
DhopCommTime=-usecond();
|
||||||
st.HaloExchangeOpt(in,compressor);
|
st.HaloExchangeOpt(in,compressor);
|
||||||
DhopCommTime+=usecond();
|
DhopCommTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Comms end "<<DhopCommTime<<" us"<<std::endl;
|
||||||
|
|
||||||
DhopComputeTime-=usecond();
|
DhopComputeTime=-usecond();
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||||
@ -476,6 +486,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
|||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
DhopComputeTime+=usecond();
|
DhopComputeTime+=usecond();
|
||||||
|
// std::cout << GridLogDslash<< " Dhop Compute end "<<DhopComputeTime<<" us" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -416,19 +416,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
#undef LoopBody
|
#undef LoopBody
|
||||||
}
|
}
|
||||||
|
|
||||||
#define KERNEL_CALL_TMP(A) \
|
|
||||||
const uint64_t NN = Nsite*Ls; \
|
|
||||||
auto U_p = & U_v[0]; \
|
|
||||||
auto in_p = & in_v[0]; \
|
|
||||||
auto out_p = & out_v[0]; \
|
|
||||||
auto st_p = st_v._entries_p; \
|
|
||||||
auto st_perm = st_v._permute_type; \
|
|
||||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
|
||||||
int sF = ss; \
|
|
||||||
int sU = ss/Ls; \
|
|
||||||
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
|
|
||||||
}); \
|
|
||||||
accelerator_barrier();
|
|
||||||
|
|
||||||
#define KERNEL_CALLNB(A) \
|
#define KERNEL_CALLNB(A) \
|
||||||
const uint64_t NN = Nsite*Ls; \
|
const uint64_t NN = Nsite*Ls; \
|
||||||
@ -448,8 +435,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
int sF = ptr[ss]; \
|
int sF = ptr[ss]; \
|
||||||
int sU = ss/Ls; \
|
int sU = ss/Ls; \
|
||||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||||
}); \
|
});
|
||||||
accelerator_barrier();
|
|
||||||
|
|
||||||
#define ASM_CALL(A) \
|
#define ASM_CALL(A) \
|
||||||
thread_for( ss, Nsite, { \
|
thread_for( ss, Nsite, { \
|
||||||
@ -471,7 +457,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
#ifdef SYCL_HACK
|
#ifdef SYCL_HACK
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
|
||||||
#else
|
#else
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
#endif
|
#endif
|
||||||
|
@ -359,6 +359,7 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
accelerator_barrier();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
@ -371,32 +372,13 @@ public:
|
|||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,i);
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Blocking send and receive. Either sequential or parallel.
|
// Blocking send and receive. Either sequential or parallel.
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void Communicate(void)
|
void Communicate(void)
|
||||||
{
|
{
|
||||||
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
|
|
||||||
/////////////////////////////////////////////////////////
|
|
||||||
// several way threaded on different communicators.
|
|
||||||
// Cannot combine with Dirichlet operators
|
|
||||||
// This scheme is needed on Intel Omnipath for best performance
|
|
||||||
// Deprecate once there are very few omnipath clusters
|
|
||||||
/////////////////////////////////////////////////////////
|
|
||||||
int nthreads = CartesianCommunicator::nCommThreads;
|
|
||||||
int old = GridThread::GetThreads();
|
|
||||||
GridThread::SetThreads(nthreads);
|
|
||||||
thread_for(i,Packets.size(),{
|
|
||||||
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
|
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
|
||||||
Packets[i].recv_buf,
|
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
|
||||||
Packets[i].bytes,i);
|
|
||||||
});
|
|
||||||
GridThread::SetThreads(old);
|
|
||||||
} else {
|
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
// Concurrent and non-threaded asynch calls to MPI
|
// Concurrent and non-threaded asynch calls to MPI
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
@ -404,7 +386,6 @@ public:
|
|||||||
this->CommunicateBegin(reqs);
|
this->CommunicateBegin(reqs);
|
||||||
this->CommunicateComplete(reqs);
|
this->CommunicateComplete(reqs);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
@ -483,7 +464,6 @@ public:
|
|||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
|
|
||||||
accelerator_barrier();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////
|
/////////////////////////
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
int world_rank; // Use to control world rank for print guarding
|
||||||
int acceleratorAbortOnGpuError=1;
|
int acceleratorAbortOnGpuError=1;
|
||||||
uint32_t accelerator_threads=2;
|
uint32_t accelerator_threads=2;
|
||||||
uint32_t acceleratorThreads(void) {return accelerator_threads;};
|
uint32_t acceleratorThreads(void) {return accelerator_threads;};
|
||||||
@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cudaDeviceProp *gpu_props;
|
cudaDeviceProp *gpu_props;
|
||||||
cudaStream_t copyStream;
|
cudaStream_t copyStream;
|
||||||
cudaStream_t cpuStream;
|
cudaStream_t computeStream;
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
@ -24,7 +25,8 @@ void acceleratorInit(void)
|
|||||||
gpu_props = new cudaDeviceProp[nDevices];
|
gpu_props = new cudaDeviceProp[nDevices];
|
||||||
|
|
||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, world_rank=0;
|
int rank = 0;
|
||||||
|
world_rank=0;
|
||||||
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
@ -99,7 +101,7 @@ void acceleratorInit(void)
|
|||||||
|
|
||||||
cudaSetDevice(device);
|
cudaSetDevice(device);
|
||||||
cudaStreamCreate(©Stream);
|
cudaStreamCreate(©Stream);
|
||||||
cudaStreamCreate(&cpuStream);
|
cudaStreamCreate(&computeStream);
|
||||||
const int len=64;
|
const int len=64;
|
||||||
char busid[len];
|
char busid[len];
|
||||||
if( rank == world_rank ) {
|
if( rank == world_rank ) {
|
||||||
@ -114,7 +116,7 @@ void acceleratorInit(void)
|
|||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
hipDeviceProp_t *gpu_props;
|
hipDeviceProp_t *gpu_props;
|
||||||
hipStream_t copyStream;
|
hipStream_t copyStream;
|
||||||
hipStream_t cpuStream;
|
hipStream_t computeStream;
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
@ -122,7 +124,8 @@ void acceleratorInit(void)
|
|||||||
gpu_props = new hipDeviceProp_t[nDevices];
|
gpu_props = new hipDeviceProp_t[nDevices];
|
||||||
|
|
||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, world_rank=0;
|
int rank = 0;
|
||||||
|
world_rank=0;
|
||||||
// We extract the local rank initialization using an environment variable
|
// We extract the local rank initialization using an environment variable
|
||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
||||||
{
|
{
|
||||||
@ -183,7 +186,7 @@ void acceleratorInit(void)
|
|||||||
#endif
|
#endif
|
||||||
hipSetDevice(device);
|
hipSetDevice(device);
|
||||||
hipStreamCreate(©Stream);
|
hipStreamCreate(©Stream);
|
||||||
hipStreamCreate(&cpuStream);
|
hipStreamCreate(&computeStream);
|
||||||
const int len=64;
|
const int len=64;
|
||||||
char busid[len];
|
char busid[len];
|
||||||
if( rank == world_rank ) {
|
if( rank == world_rank ) {
|
||||||
@ -210,7 +213,8 @@ void acceleratorInit(void)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, world_rank=0;
|
int rank = 0;
|
||||||
|
world_rank=0;
|
||||||
|
|
||||||
// We extract the local rank initialization using an environment variable
|
// We extract the local rank initialization using an environment variable
|
||||||
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
|
||||||
|
@ -107,7 +107,7 @@ void acceleratorInit(void);
|
|||||||
|
|
||||||
extern int acceleratorAbortOnGpuError;
|
extern int acceleratorAbortOnGpuError;
|
||||||
extern cudaStream_t copyStream;
|
extern cudaStream_t copyStream;
|
||||||
extern cudaStream_t cpuStream;
|
extern cudaStream_t computeStream;
|
||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
@ -135,7 +135,7 @@ inline void cuda_mem(void)
|
|||||||
}; \
|
}; \
|
||||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \
|
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define accelerator_for6dNB(iter1, num1, \
|
#define accelerator_for6dNB(iter1, num1, \
|
||||||
@ -154,7 +154,7 @@ inline void cuda_mem(void)
|
|||||||
}; \
|
}; \
|
||||||
dim3 cu_blocks (num1,num2,num3); \
|
dim3 cu_blocks (num1,num2,num3); \
|
||||||
dim3 cu_threads(num4,num5,num6); \
|
dim3 cu_threads(num4,num5,num6); \
|
||||||
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename lambda> __global__
|
template<typename lambda> __global__
|
||||||
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
|||||||
|
|
||||||
#define accelerator_barrier(dummy) \
|
#define accelerator_barrier(dummy) \
|
||||||
{ \
|
{ \
|
||||||
cudaDeviceSynchronize(); \
|
cudaStreamSynchronize(computeStream); \
|
||||||
cudaError err = cudaGetLastError(); \
|
cudaError err = cudaGetLastError(); \
|
||||||
if ( cudaSuccess != err ) { \
|
if ( cudaSuccess != err ) { \
|
||||||
printf("accelerator_barrier(): Cuda error %s \n", \
|
printf("accelerator_barrier(): Cuda error %s \n", \
|
||||||
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
#define accelerator_inline __host__ __device__ inline
|
#define accelerator_inline __host__ __device__ inline
|
||||||
|
|
||||||
extern hipStream_t copyStream;
|
extern hipStream_t copyStream;
|
||||||
extern hipStream_t cpuStream;
|
extern hipStream_t computeStream;
|
||||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
@ -362,16 +362,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
||||||
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
||||||
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
||||||
0,0/*cpuStream*/, \
|
0,computeStream, \
|
||||||
num1,num2,nsimd, lambda); \
|
num1,num2,nsimd, lambda); \
|
||||||
} else { \
|
} else { \
|
||||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
||||||
0,0/*cpuStream*/, \
|
0,computeStream, \
|
||||||
num1,num2,nsimd, lambda); \
|
num1,num2,nsimd, lambda); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename lambda> __global__
|
template<typename lambda> __global__
|
||||||
__launch_bounds__(64,1)
|
__launch_bounds__(64,1)
|
||||||
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||||
@ -400,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
|||||||
|
|
||||||
#define accelerator_barrier(dummy) \
|
#define accelerator_barrier(dummy) \
|
||||||
{ \
|
{ \
|
||||||
hipStreamSynchronize(cpuStream); \
|
hipStreamSynchronize(computeStream); \
|
||||||
auto err = hipGetLastError(); \
|
auto err = hipGetLastError(); \
|
||||||
if ( err != hipSuccess ) { \
|
if ( err != hipSuccess ) { \
|
||||||
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
||||||
@ -443,7 +442,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
|
|||||||
|
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
|
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
|
||||||
|
|
||||||
|
@ -332,9 +332,9 @@ int main(int argc, char **argv) {
|
|||||||
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
|
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
|
||||||
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
|
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
|
||||||
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
|
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
|
||||||
Level1.push_back(&StrangePseudoFermionBdy);
|
Level1.push_back(&StrangePseudoFermionBdy); // ok
|
||||||
Level2.push_back(&StrangePseudoFermionLocal);
|
Level2.push_back(&StrangePseudoFermionLocal);
|
||||||
Level1.push_back(&StrangePseudoFermionPVBdy);
|
Level1.push_back(&StrangePseudoFermionPVBdy); //ok
|
||||||
|
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// up down action
|
// up down action
|
||||||
@ -436,6 +436,10 @@ int main(int argc, char **argv) {
|
|||||||
*Numerators[h],*Denominators[h],
|
*Numerators[h],*Denominators[h],
|
||||||
*NumeratorsF[h],*DenominatorsF[h],
|
*NumeratorsF[h],*DenominatorsF[h],
|
||||||
OFRp, 500) );
|
OFRp, 500) );
|
||||||
|
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
|
||||||
|
*Numerators[h],*Denominators[h],
|
||||||
|
*NumeratorsF[h],*DenominatorsF[h],
|
||||||
|
OFRp, 500) );
|
||||||
#else
|
#else
|
||||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||||
|
Reference in New Issue
Block a user