diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 9228b84c..9f6960af 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
@@ -505,11 +506,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 1e07887b..163e4ac4 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -202,7 +202,8 @@ void acceleratorInit(void)
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
-  theCopyAccelerator = new sycl::queue (selectedDevice);
+  // theCopyAccelerator = new sycl::queue (selectedDevice);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index a7fd8db7..c7366a1f 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -247,7 +247,12 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-
+#ifdef GRID_SYCL
+inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
+#else
+// Ordering within a stream guaranteed on Nvidia & AMD
+inline void acceleratorFenceComputeStream(void){ };
+#endif
 #ifdef GRID_SYCL
 NAMESPACE_END(Grid);
 #include
@@ -299,15 +304,15 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     });                                           \
   });
 
-#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); ; theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
+inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
-inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
 
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}