1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 09:42:02 +01:00

Compare commits

...

8 Commits

8 changed files with 60 additions and 76 deletions

View File

@ -406,6 +406,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
acceleratorCopySynchronise(); acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size(); int nreq=list.size();

View File

@ -400,7 +400,6 @@ public:
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
} }
}; };

View File

@ -233,10 +233,10 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
GaugeField HUmu(_Umu.Grid()); GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5); HUmu = _Umu*(-0.5);
if ( Dirichlet ) { if ( Dirichlet ) {
std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl; std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
Coordinate GaugeBlock(Nd); Coordinate GaugeBlock(Nd);
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1]; for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl; std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
DirichletFilter<GaugeField> Filter(GaugeBlock); DirichletFilter<GaugeField> Filter(GaugeBlock);
Filter.applyFilter(HUmu); Filter.applyFilter(HUmu);
} }
@ -382,12 +382,14 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
DhopTotalTime-=usecond(); // std::cout << GridLogDslash<<"Dhop internal"<<std::endl;
DhopTotalTime=-usecond();
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
DhopInternalSerialComms(st,lo,U,in,out,dag); DhopInternalSerialComms(st,lo,U,in,out,dag);
DhopTotalTime+=usecond(); DhopTotalTime+=usecond();
// std::cout << GridLogDslash<<"Dhop took"<<DhopTotalTime<<std::endl;
} }
@ -404,53 +406,59 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
///////////////////////////// /////////////////////////////
// Start comms // Gather intranode and extra node differentiated?? // Start comms // Gather intranode and extra node differentiated??
///////////////////////////// /////////////////////////////
DhopFaceTime-=usecond(); DhopFaceTime=-usecond();
st.HaloExchangeOptGather(in,compressor); st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " <<std::endl;
DhopCommTime -=usecond(); DhopCommTime =-usecond();
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
DhopFaceTime-=usecond(); DhopFaceTime=-usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Commsmerge end "<<DhopFaceTime<< " us "<<std::endl;
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
DhopComputeTime-=usecond(); DhopComputeTime=-usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} else { } else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} }
DhopComputeTime+=usecond(); DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute 1 end "<< DhopComputeTime<<" us" <<std::endl;
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
DhopCommTime +=usecond(); DhopCommTime +=usecond();
// std::cout << GridLogDslash<< " Dhop Comunicate end "<< DhopCommTime << " us" <<std::endl;
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
DhopFaceTime-=usecond(); DhopFaceTime=-usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// std::cout << GridLogDslash<< " Dhop CommsMerge2 end "<<DhopFaceTime << " us "<<std::endl;
DhopComputeTime2-=usecond(); DhopComputeTime2=-usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} else { } else {
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} }
DhopComputeTime2+=usecond(); DhopComputeTime2+=usecond();
// std::cout << GridLogDslash<< " Dhop Ext end "<<DhopComputeTime2 <<"us "<<std::endl;
} }
@ -463,12 +471,14 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Compressor compressor(dag); Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
DhopCommTime-=usecond(); // std::cout << GridLogDslash<< " Dhop Halo exchange begine " <<std::endl;
DhopCommTime=-usecond();
st.HaloExchangeOpt(in,compressor); st.HaloExchangeOpt(in,compressor);
DhopCommTime+=usecond(); DhopCommTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Comms end "<<DhopCommTime<<" us"<<std::endl;
DhopComputeTime-=usecond(); DhopComputeTime=-usecond();
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
@ -476,6 +486,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} }
DhopComputeTime+=usecond(); DhopComputeTime+=usecond();
// std::cout << GridLogDslash<< " Dhop Compute end "<<DhopComputeTime<<" us" <<std::endl;
} }

View File

@ -416,19 +416,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody #undef LoopBody
} }
#define KERNEL_CALL_TMP(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \ #define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \ const uint64_t NN = Nsite*Ls; \
@ -448,8 +435,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ptr[ss]; \ int sF = ptr[ss]; \
int sU = ss/Ls; \ int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \ });
accelerator_barrier();
#define ASM_CALL(A) \ #define ASM_CALL(A) \
thread_for( ss, Nsite, { \ thread_for( ss, Nsite, { \
@ -471,7 +457,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK #ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; } if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else #else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif #endif

View File

@ -359,6 +359,7 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
accelerator_barrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,
@ -371,39 +372,19 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
_grid->StencilSendToRecvFromComplete(MpiReqs,i); _grid->StencilSendToRecvFromComplete(MpiReqs,0);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel. // Blocking send and receive. Either sequential or parallel.
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void Communicate(void) void Communicate(void)
{ {
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ /////////////////////////////////////////////////////////
///////////////////////////////////////////////////////// // Concurrent and non-threaded asynch calls to MPI
// several way threaded on different communicators. /////////////////////////////////////////////////////////
// Cannot combine with Dirichlet operators std::vector<std::vector<CommsRequest_t> > reqs;
// This scheme is needed on Intel Omnipath for best performance this->CommunicateBegin(reqs);
// Deprecate once there are very few omnipath clusters this->CommunicateComplete(reqs);
/////////////////////////////////////////////////////////
int nthreads = CartesianCommunicator::nCommThreads;
int old = GridThread::GetThreads();
GridThread::SetThreads(nthreads);
thread_for(i,Packets.size(),{
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
});
GridThread::SetThreads(old);
} else {
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
}
} }
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
@ -483,7 +464,6 @@ public:
face_table_computed=1; face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
accelerator_barrier();
} }
///////////////////////// /////////////////////////

View File

@ -1,6 +1,7 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
int world_rank; // Use to control world rank for print guarding
int acceleratorAbortOnGpuError=1; int acceleratorAbortOnGpuError=1;
uint32_t accelerator_threads=2; uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;}; uint32_t acceleratorThreads(void) {return accelerator_threads;};
@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA #ifdef GRID_CUDA
cudaDeviceProp *gpu_props; cudaDeviceProp *gpu_props;
cudaStream_t copyStream; cudaStream_t copyStream;
cudaStream_t cpuStream; cudaStream_t computeStream;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
@ -24,7 +25,8 @@ void acceleratorInit(void)
gpu_props = new cudaDeviceProp[nDevices]; gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, world_rank=0; int rank = 0;
world_rank=0;
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
@ -99,7 +101,7 @@ void acceleratorInit(void)
cudaSetDevice(device); cudaSetDevice(device);
cudaStreamCreate(&copyStream); cudaStreamCreate(&copyStream);
cudaStreamCreate(&cpuStream); cudaStreamCreate(&computeStream);
const int len=64; const int len=64;
char busid[len]; char busid[len];
if( rank == world_rank ) { if( rank == world_rank ) {
@ -114,7 +116,7 @@ void acceleratorInit(void)
#ifdef GRID_HIP #ifdef GRID_HIP
hipDeviceProp_t *gpu_props; hipDeviceProp_t *gpu_props;
hipStream_t copyStream; hipStream_t copyStream;
hipStream_t cpuStream; hipStream_t computeStream;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
@ -122,7 +124,8 @@ void acceleratorInit(void)
gpu_props = new hipDeviceProp_t[nDevices]; gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, world_rank=0; int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable // We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{ {
@ -183,7 +186,7 @@ void acceleratorInit(void)
#endif #endif
hipSetDevice(device); hipSetDevice(device);
hipStreamCreate(&copyStream); hipStreamCreate(&copyStream);
hipStreamCreate(&cpuStream); hipStreamCreate(&computeStream);
const int len=64; const int len=64;
char busid[len]; char busid[len];
if( rank == world_rank ) { if( rank == world_rank ) {
@ -210,7 +213,8 @@ void acceleratorInit(void)
#endif #endif
char * localRankStr = NULL; char * localRankStr = NULL;
int rank = 0, world_rank=0; int rank = 0;
world_rank=0;
// We extract the local rank initialization using an environment variable // We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)

View File

@ -107,7 +107,7 @@ void acceleratorInit(void);
extern int acceleratorAbortOnGpuError; extern int acceleratorAbortOnGpuError;
extern cudaStream_t copyStream; extern cudaStream_t copyStream;
extern cudaStream_t cpuStream; extern cudaStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
@ -135,7 +135,7 @@ inline void cuda_mem(void)
}; \ }; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,nsimd,lambda); \ LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
} }
#define accelerator_for6dNB(iter1, num1, \ #define accelerator_for6dNB(iter1, num1, \
@ -154,7 +154,7 @@ inline void cuda_mem(void)
}; \ }; \
dim3 cu_blocks (num1,num2,num3); \ dim3 cu_blocks (num1,num2,num3); \
dim3 cu_threads(num4,num5,num6); \ dim3 cu_threads(num4,num5,num6); \
Lambda6Apply<<<cu_blocks,cu_threads,0,cpuStream>>>(num1,num2,num3,num4,num5,num6,lambda); \ Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
} }
template<typename lambda> __global__ template<typename lambda> __global__
@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
#define accelerator_barrier(dummy) \ #define accelerator_barrier(dummy) \
{ \ { \
cudaDeviceSynchronize(); \ cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \ cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \ if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \ printf("accelerator_barrier(): Cuda error %s \n", \
@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline #define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream; extern hipStream_t copyStream;
extern hipStream_t cpuStream; extern hipStream_t computeStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */ /*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
@ -362,16 +362,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \ 0,computeStream, \
num1,num2,nsimd, lambda); \ num1,num2,nsimd, lambda); \
} else { \ } else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0/*cpuStream*/, \ 0,computeStream, \
num1,num2,nsimd, lambda); \ num1,num2,nsimd, lambda); \
} \ } \
} }
template<typename lambda> __global__ template<typename lambda> __global__
__launch_bounds__(64,1) __launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
@ -400,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \ #define accelerator_barrier(dummy) \
{ \ { \
hipStreamSynchronize(cpuStream); \ hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \ auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \ if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@ -443,7 +442,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{ {
hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice); hipMemcpyDtoDAsync(to,from,bytes, copyStream);
} }
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };

View File

@ -332,9 +332,9 @@ int main(int argc, char **argv) {
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp);
Level1.push_back(&StrangePseudoFermionBdy); Level1.push_back(&StrangePseudoFermionBdy); // ok
Level2.push_back(&StrangePseudoFermionLocal); Level2.push_back(&StrangePseudoFermionLocal);
Level1.push_back(&StrangePseudoFermionPVBdy); Level1.push_back(&StrangePseudoFermionPVBdy); //ok
//////////////////////////////////// ////////////////////////////////////
// up down action // up down action
@ -436,6 +436,10 @@ int main(int argc, char **argv) {
*Numerators[h],*Denominators[h], *Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h], *NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) ); OFRp, 500) );
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
OFRp, 500) );
#else #else
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));