From a762b1fb712fa620e5662e4e7d41a94925ecff00 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 21 Oct 2016 09:03:26 +0100 Subject: [PATCH 01/27] MPI3 working with a bounce through shared memory on my laptop. Longer term plan: make the "u_comm_buf" in Stencil point to the shared region and avoid the send between ranks on same node. --- benchmarks/Benchmark_dwf.cc | 7 +- lib/AlignedAllocator.h | 9 - lib/Init.cc | 2 +- lib/Log.cc | 2 +- lib/Threads.h | 17 ++ lib/communicator/Communicator_base.h | 26 ++- lib/communicator/Communicator_mpi.cc | 11 +- lib/communicator/Communicator_mpi3.cc | 250 +++++++++++++++++--------- 8 files changed, 208 insertions(+), 116 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 6a283085..a32d7535 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -208,7 +208,6 @@ int main (int argc, char ** argv) std::cout< 1.0e-6 ) { - std::cout << "site "< &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#ifdef GRID_COMMS_MPI +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/Threads.h b/lib/Threads.h index d502dd82..70350685 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -31,6 +31,7 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP +#warning "OpenMP" #endif #define UNROLL _Pragma("unroll") @@ -127,6 +128,22 @@ class GridThread { ThreadBarrier(); }; + static void bcopy(const void *src, void *dst, size_t len) { +#ifdef GRID_OMP +#pragma omp parallel + { + const char *c_src =(char *) src; + char *c_dest=(char *) dst; + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + bcopy(&c_src[myoff],&c_dest[myoff],mywork); + } +#else + bcopy(src,dst,len); +#endif + } + + }; } diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 9b5ae8cb..6d6602be 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -45,7 +45,7 @@ class CartesianCommunicator { public: // Communicator should know nothing of the physics grid, only processor grid. - + int _Nprocessors; // How many in all std::vector _processors; // Which dimensions get relayed out over processors lanes. 
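// A minimal sketch (not part of the patch) of the work-splitting idiom behind
// GridThread::bcopy added in Threads.h above: each OpenMP thread computes its
// contiguous byte range and copies only that slice. The range arithmetic is an
// illustrative stand-in for GridThread::GetWorkBarrier; argument order follows
// BSD bcopy(src,dst,len).
#ifdef _OPENMP
#include <omp.h>
#endif
#include <cstring>
#include <algorithm>
static inline void threaded_bcopy_sketch(const void *src, void *dst, size_t len)
{
#ifdef _OPENMP
#pragma omp parallel
  {
    size_t nthr  = (size_t)omp_get_num_threads();
    size_t me    = (size_t)omp_get_thread_num();
    size_t chunk = (len + nthr - 1) / nthr;          // ceil(len/nthr)
    size_t off   = std::min(me * chunk, len);        // my starting byte
    size_t work  = std::min(chunk, len - off);       // my byte count
    memcpy((char *)dst + off, (const char *)src + off, work);
  }
#else
  memcpy(dst, src, len);
#endif
}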
int _processor; // linear processor rank @@ -56,10 +56,13 @@ class CartesianCommunicator { MPI_Comm communicator; typedef MPI_Request CommsRequest_t; #elif GRID_COMMS_MPI3 + int shm_mode; + MPI_Comm communicator; typedef MPI_Request CommsRequest_t; - const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now + const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now + const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms std::vector WorldDims; std::vector GroupDims; @@ -69,14 +72,23 @@ class CartesianCommunicator { std::vector ShmCoor; std::vector WorldCoor; - int GroupRank; - int ShmRank; - int WorldRank; + static std::vector GroupRanks; + static std::vector MyGroup; + static int ShmSetup; + static MPI_Win ShmWindow; + static MPI_Comm ShmComm; - int GroupSize; - int ShmSize; + void * ShmCommBuf; + std::vector ShmCommBufs; + + int WorldRank; int WorldSize; + static int ShmRank; + static int ShmSize; + static int GroupSize; + static int GroupRank; + std::vector LexicographicToWorldRank; #else typedef int CommsRequest_t; diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index dff9811a..9c66202f 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -39,11 +39,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { } } - int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; - } +int Rank(void) { + int pe; + MPI_Comm_rank(MPI_COMM_WORLD,&pe); + return pe; +} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { @@ -168,7 +168,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 11734fd9..1a36e610 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -30,6 +30,8 @@ Author: Peter Boyle namespace Grid { + + // Global used by Init and nowhere else. How to hide? 
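// A minimal sketch (illustrative, not Grid code) of the MPI-3 pattern the
// constructor below relies on: split the world communicator into one
// communicator per shared-memory node, allocate a window of MAX_MPI_SHM_BYTES
// inside it, and query every on-node rank's base pointer so a peer's slot can
// be addressed directly with plain loads and stores.
#include <mpi.h>
#include <cassert>
static void shm_window_sketch(void)
{
  MPI_Comm shm_comm;
  int ierr = MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,
                                 MPI_INFO_NULL,&shm_comm);
  assert(ierr==0);

  void    *base = NULL;
  MPI_Win  win;
  MPI_Aint bytes = 256*1024*1024;   // cf. MAX_MPI_SHM_BYTES above
  ierr = MPI_Win_allocate_shared(bytes,1,MPI_INFO_NULL,shm_comm,&base,&win);
  assert(ierr==0);

  int shm_size; MPI_Comm_size(shm_comm,&shm_size);
  for(int r=0;r<shm_size;r++){      // locate each on-node rank's segment
    MPI_Aint sz; int disp_unit; void *peer;
    ierr = MPI_Win_shared_query(win,r,&sz,&disp_unit,&peer);
    assert(ierr==0);
  }
}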
int Rank(void) { int pe; @@ -76,29 +78,129 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c rank = LexicographicToWorldRank[rank]; } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmSetup = 0; +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +MPI_Comm CartesianCommunicator::ShmComm; +MPI_Win CartesianCommunicator::ShmWindow; +std::vector CartesianCommunicator::GroupRanks; +std::vector CartesianCommunicator::MyGroup; + CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { + _ndimension = processors.size(); - std::cout << "Creating "<< _ndimension << " dim communicator "< world_ranks(WorldSize); + GroupRanks.resize(WorldSize); + MyGroup.resize(ShmSize); + for(int r=0;r()); + int myleader = MyGroup[0]; + + std::vector leaders_1hot(WorldSize,0); + std::vector leaders_group(GroupSize,0); + leaders_1hot [ myleader ] = 1; + + /////////////////////////////////////////////////////////////////// + // global sum leaders over comm world + /////////////////////////////////////////////////////////////////// + ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); + assert(ierr==0); + + /////////////////////////////////////////////////////////////////// + // find the group leaders world rank + /////////////////////////////////////////////////////////////////// + int group=0; + for(int l=0;l &processors) ShmDims.resize(_ndimension,1); GroupDims.resize(_ndimension); - + ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); @@ -129,12 +231,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmDims[dim]*=2; dim=(dim+1)%_ndimension; } - - std::cout << "Shm group dims "< &processors) for(int d=0;d<_ndimension;d++){ GroupDims[d] = WorldDims[d]/ShmDims[d]; } - std::cout << "Group dims "< world_ranks(WorldSize); - std::vector group_ranks(WorldSize); - std::vector mygroup(GroupSize); - for(int r=0;r &processors) _processors = processors; _processor_coor.resize(_ndimension); for(int i=0;i<_ndimension;i++){ - std::cout << " p " << _processors[i]<()); - int myleader = mygroup[0]; - - std::vector leaders_1hot(WorldSize,0); - std::vector leaders_group(GroupSize,0); - leaders_1hot [ myleader ] = 1; - - /////////////////////////////////////////////////////////////////// - // global sum leaders over comm world - /////////////////////////////////////////////////////////////////// - int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); - assert(ierr==0); - - /////////////////////////////////////////////////////////////////// - // find the group leaders world rank - /////////////////////////////////////////////////////////////////// - int group=0; - for(int l=0;l &lis { MPI_Request xrq; MPI_Request rrq; + int rank = _processor; int ierr; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + int tag; + int small = (bytes &list) { int nreq=list.size(); From 306160ad9ad7675774931164e48154806c0ce6aa Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 21 Oct 2016 12:07:28 +0100 Subject: [PATCH 02/27] bcopy 
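// A minimal sketch (illustrative) of the leader-election step in the patch 01
// constructor above: every rank marks its node leader (the lowest world rank
// in its shared-memory group) in a one-hot vector; an in-place Allreduce then
// lets each rank enumerate the leaders in world-rank order, yielding both
// GroupSize and this node's GroupRank.
#include <mpi.h>
#include <vector>
#include <cassert>
static int group_rank_sketch(MPI_Comm world,int my_leader)
{
  int world_size; MPI_Comm_size(world,&world_size);
  std::vector<int> leaders_1hot(world_size,0);
  leaders_1hot[my_leader] = 1;
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],world_size,
                           MPI_INT,MPI_SUM,world);
  assert(ierr==0);
  int group=0, my_group=-1;
  for(int l=0;l<world_size;l++){      // leaders appear in world-rank order
    if( l==my_leader ) my_group = group;
    if( leaders_1hot[l] ) group++;    // after the loop, group == GroupSize
  }
  assert(my_group>=0);
  return my_group;                    // this node's GroupRank
}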
threaded --- lib/communicator/Communicator_mpi3.cc | 54 ++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 1a36e610..240aa09e 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -196,10 +196,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) } ShmCommBufs.resize(ShmSize); + ShmStencilBufs.resize(ShmSize); for(int r=0;r &lis int from, int bytes) { +#undef SHM_USE_BCOPY MPI_Request xrq; MPI_Request rrq; + static int sequence; + int rank = _processor; int ierr; int tag; - int small = (bytes>2; + assert(((size_t)bytes &0x3)==0); + assert(((size_t)xmit &0x3)==0); + assert(((size_t)recv &0x3)==0); +#endif + assert(gme == ShmRank); if ( small && (dest !=MPI_UNDEFINED) ) { - char *ptr = (char *)ShmCommBufs[gdest]; assert(gme != gdest); - GridThread::bcopy(xmit,ptr,bytes); - bcopy(&_processor,&ptr[bytes],sizeof(_processor)); - bcopy(& sequence,&ptr[bytes+4],sizeof(sequence)); + float *ip = (float *)xmit; + float *op = (float *)to_ptr; + +#ifdef SHM_USE_BCOPY + bcopy(ip,op,bytes); +#else + PARALLEL_FOR_LOOP + for(int w=0;w &lis MPI_Win_sync (ShmWindow); if (small && (from !=MPI_UNDEFINED) ) { - char *ptr = (char *)ShmCommBufs[ShmRank]; - GridThread::bcopy(ptr,recv,bytes); - bcopy(&ptr[bytes] ,&tag ,sizeof(tag)); - bcopy(&ptr[bytes+4],&check,sizeof(check)); + float *ip = (float *)from_ptr; + float *op = (float *)recv; +#ifdef SHM_USE_BCOPY + bcopy(ip,op,bytes); +#else + PARALLEL_FOR_LOOP + for(int w=0;w Date: Fri, 21 Oct 2016 12:12:14 +0100 Subject: [PATCH 03/27] Compile verbose reduce --- lib/Threads.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Threads.h b/lib/Threads.h index 70350685..08e5d545 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -31,7 +31,6 @@ Author: paboyle #ifdef _OPENMP #define GRID_OMP -#warning "OpenMP" #endif #define UNROLL _Pragma("unroll") From f331809c2715cd4dc1b0e486c77d8552010a79de Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:35:37 +0100 Subject: [PATCH 04/27] Use variable type for loop --- lib/communicator/Communicator_mpi3.cc | 45 ++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 240aa09e..7d5ffff3 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -1,3 +1,4 @@ + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -181,6 +182,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmCommBuf = 0; ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); assert(ierr==0); + MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Verbose for now @@ -368,22 +370,24 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes>2; - assert(((size_t)bytes &0x3)==0); - assert(((size_t)xmit &0x3)==0); - assert(((size_t)recv &0x3)==0); + typedef double T; + int words = bytes/sizeof(T); + assert(((size_t)bytes &(sizeof(T)-1))==0); + // assert(((size_t)xmit &(sizeof(T)-1))==0); + // assert(((size_t)recv &(sizeof(T)-1))==0); #endif assert(gme == ShmRank); - + + // std::cerr << 
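// A minimal sketch (illustrative; slot size and field names are assumptions)
// of the tagged shared-memory bounce used above: the sender copies the payload
// into the destination rank's slot in the shared window, then appends its rank
// and a sequence number; after MPI_Win_sync and a node barrier the receiver
// drains the payload and checks both trailer words, catching reuse of a slot
// before the previous message was consumed.
#include <cstring>
#include <cassert>
struct ShmSlotSketch { char payload[1024]; int tag; int seq; };
static void shm_send_sketch(ShmSlotSketch *dest_slot,const void *xmit,
                            size_t bytes,int my_rank,int sequence)
{
  assert(bytes<=sizeof(dest_slot->payload));
  memcpy(dest_slot->payload,xmit,bytes); // payload first
  dest_slot->tag = my_rank;              // then the rank tag ...
  dest_slot->seq = sequence;             // ... and the sequence number
}
static void shm_recv_sketch(const ShmSlotSketch *my_slot,void *recv,
                            size_t bytes,int expect_rank,int expect_seq)
{
  // caller has already issued MPI_Win_sync on the window and a node barrier
  memcpy(recv,my_slot->payload,bytes);
  assert(my_slot->tag==expect_rank);
  assert(my_slot->seq==expect_seq);
}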
"proc dest from gme gdest "<<_processor<<" "< &lis list.push_back(xrq); } + // std::cout << "Syncing "< &lis list.push_back(rrq); } + // std::cout << "Syncing"< &list) From fad96cf2507a5a36bfe3fdd7f9dd9bb19be15543 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:36:00 +0100 Subject: [PATCH 05/27] StencilBufs --- lib/communicator/Communicator_base.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 6d6602be..f2a6bf00 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -80,6 +80,7 @@ class CartesianCommunicator { void * ShmCommBuf; std::vector ShmCommBufs; + std::vector ShmStencilBufs; int WorldRank; int WorldSize; From 6a9eae6b6b5253fa6372f0a742110517e92b209a Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 13:36:18 +0100 Subject: [PATCH 06/27] Reporting improvements --- lib/qcd/action/fermion/WilsonFermion5D.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index a96b6fca..7c4fafff 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -194,7 +194,7 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; - RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; + RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; @@ -396,7 +396,6 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - DhopCalls++; // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -454,6 +453,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, template void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -465,6 +465,7 @@ void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -476,6 +477,7 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=2; conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,out._grid); From 09fd5c43a711900e101dc1e3d2a449b8a4f1a93c Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 15:17:39 +0100 Subject: [PATCH 07/27] Reasonably fast version --- lib/communicator/Communicator_mpi3.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 7d5ffff3..5c7bb09b 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -28,6 +28,7 @@ Author: Peter Boyle /* END LEGAL */ #include "Grid.h" 
#include +#include namespace Grid { @@ -182,6 +183,15 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ShmCommBuf = 0; ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); assert(ierr==0); + for(uint64_t page=0;page &lis char *to_ptr = (char *)ShmCommBufs[gdest]; char *from_ptr = (char *)ShmCommBufs[ShmRank]; - int small = (bytes Date: Fri, 21 Oct 2016 22:34:29 +0100 Subject: [PATCH 08/27] Typo fixes and rotate for CLANG --- lib/simd/Grid_avx512.h | 12 +++++++++++- lib/simd/Intel512avx.h | 2 +- lib/simd/Intel512common.h | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 821898d9..45d6d988 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -41,6 +41,16 @@ Author: paboyle namespace Grid{ namespace Optimization { + + union u512f { + __m512 v; + float f[16]; + }; + + union u512d { + __m512d v; + double f[8]; + }; struct Vsplat{ //Complex float @@ -361,7 +371,7 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases -#undef GNU_CLANG_COMPILER +#define GNU_CLANG_COMPILER #ifdef GNU_CLANG_COMPILER //Complex float Reduce template<> diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index cdd54a33..19157db4 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -53,7 +53,7 @@ Author: paboyle #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ + VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,C,Ciirr) \ VMULd(tmp,B,Briir) \ VMULd(tmp,C,Criir) diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index dbfb30c2..cfa20c26 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -37,7 +37,7 @@ Author: paboyle // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// #define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ + __asm__ ("mov $0xAAAA, %%eax \n"\ "kmovw %%eax, %%k6 \n"\ "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); From 910b8dd6a1d89443ed42306439161db09d05d81b Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Fri, 21 Oct 2016 22:35:29 +0100 Subject: [PATCH 09/27] use simd type --- lib/communicator/Communicator_mpi3.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 5c7bb09b..be4e66a9 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -380,7 +380,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes &lis T *op = (T *)to_ptr; PARALLEL_FOR_LOOP for(int w=0;w &lis T *op = (T *)recv; PARALLEL_FOR_LOOP for(int w=0;w Date: Fri, 21 Oct 2016 22:44:10 +0100 Subject: [PATCH 10/27] Simplify the comms structure prior to implementing Shared memory direct bouncs --- lib/Stencil.h | 168 +++++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 119 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 65de9945..4850425b 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -70,20 +70,20 @@ namespace Grid { -template void -Gather_plane_simple_table_compute (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector >& table) +inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int 
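// A minimal sketch (assuming 4KB pages) of the first-touch loop PATCH 07 adds
// right after MPI_Win_allocate_shared above: storing one byte per page forces
// the kernel to map the whole shared region up front rather than faulting
// pages in on the communication critical path.
#include <cstdint>
static void touch_pages_sketch(void *shm_buf,uint64_t bytes)
{
  const uint64_t page_size = 4096;
  volatile char *p = (volatile char *)shm_buf;
  for(uint64_t page=0;page<bytes;page+=page_size){
    p[page] = 0;                      // fault the page in now
  }
}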
plane,int cbmask, + int off,std::vector > & table) { table.resize(0); - int rd = rhs._grid->_rdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; - if ( !rhs._grid->CheckerBoarded(dimension) ) { + if ( !grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } - int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int e1=rhs._grid->_slice_nblock[dimension]; - int e2=rhs._grid->_slice_block[dimension]; + int so= plane*grid->_ostride[dimension]; // base offset for start of plane + int e1=grid->_slice_nblock[dimension]; + int e2=grid->_slice_block[dimension]; - int stride=rhs._grid->_slice_stride[dimension]; + int stride=grid->_slice_stride[dimension]; if ( cbmask == 0x3 ) { table.resize(e1*e2); for(int n=0;n &rhs,commVector &bu for(int n=0;nCheckerBoardFromOindexTable(o+b); + int ocb=1<CheckerBoardFromOindexTable(o+b); if ( ocb &cbmask ) { table[bo]=std::pair(bo,o+b); bo++; } @@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice &rhs,commVector &bu } template void -Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,commVector &buffer, - compressor &compress, int off,int so) +Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { PARALLEL_FOR_LOOP for(int i=0;i void -Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off, +Gather_plane_simple_stencil (const Lattice &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off, double &t_table ,double & t_data ) { std::vector > table; - Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table); + Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table); int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane Gather_plane_simple_table (table,rhs,buffer,compress,off,so); } @@ -143,10 +142,11 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
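// A minimal sketch (illustrative; the compressor is dropped for clarity) of
// how the (buffer,lattice) offset table computed above is consumed: the index
// arithmetic and checkerboard filtering run once, and every subsequent halo
// exchange reduces to a flat, trivially parallel copy through the table, as in
// Gather_plane_simple_table.
#include <vector>
#include <utility>
template<class obj>
static void gather_via_table_sketch(const std::vector<std::pair<int,int> > &table,
                                    const obj *lattice,obj *buffer,
                                    int off,int so)
{
  for(size_t i=0;i<table.size();i++){  // PARALLEL_FOR_LOOP in Grid proper
    buffer[off+table[i].first] = lattice[so+table[i].second];
  }
}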
public: - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; ////////////////////////////////////////// // Comms packet queue for asynch thread @@ -158,7 +158,6 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i Integer to_rank; Integer from_rank; Integer bytes; - volatile Integer done; }; std::vector Packets; @@ -166,81 +165,39 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i int face_table_computed; std::vector > > face_table ; -#define SEND_IMMEDIATE -#define SERIAL_SENDS - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - #ifdef SEND_IMMEDIATE - commtime-=usecond(); - _grid->SendToRecvFrom(xmit,to,rcv,from,bytes); - commtime+=usecond(); - #endif Packet p; p.send_buf = xmit; p.recv_buf = rcv; p.to_rank = to; p.from_rank= from; p.bytes = bytes; - p.done = 0; comms_bytes+=2.0*bytes; Packets.push_back(p); - } - #ifdef SERIAL_SENDS - void Communicate(void ) { - commtime-=usecond(); + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); for(int i=0;iSendToRecvFrom( - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - #endif - Packets[i].done = 1; + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); } commtime+=usecond(); } - #else - void Communicate(void ) { - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector > reqs(Packets.size()); - commtime-=usecond(); - const int concurrency=2; - for(int i=0;iSendToRecvFromBegin(reqs[j], - Packets[j].send_buf, - Packets[j].to_rank, - Packets[j].recv_buf, - Packets[j].from_rank, - Packets[j].bytes); - #endif - } - } - for(int ii=0;iiSendToRecvFromComplete(reqs[i]); - #endif - } - } - for(int ii=0;ii > &reqs) + { + commtime-=usecond(); + for(int i=0;iSendToRecvFromComplete(reqs[i]); } commtime+=usecond(); } - #endif /////////////////////////////////////////// // Simd merge queue for asynch comms @@ -260,36 +217,19 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i m.rpointers= rpointers; m.buffer_size = buffer_size; m.packet_id = packet_id; - #ifdef SEND_IMMEDIATE - mergetime-=usecond(); - PARALLEL_FOR_LOOP - for(int o=0;o &rhs,commVector &buffer,i else return cbase + _entries[ent]._byte_offset; } + /////////////////////////////////////////////////////////// // Comms buffers + /////////////////////////////////////////////////////////// std::vector > u_simd_send_buf; std::vector > u_simd_recv_buf; commVector u_send_buf; commVector comm_buf; + int u_comm_offset; int _unified_buffer_size; @@ -483,7 +426,7 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i } } u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); + comm_buf.resize(_unified_buffer_size); PrecomputeByteOffsets(); @@ -722,31 +665,16 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i template void HaloExchange(const Lattice &source,compressor &compress) { + std::vector > reqs; calls++; Mergers.resize(0); Packets.resize(0); 
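// A minimal sketch (illustrative; names are assumptions) of the begin/complete
// split that replaces the old blocking Communicate(): every face's send and
// receive is posted up front, then all requests are completed in a single
// Waitall, so independent faces are in flight concurrently.
#include <mpi.h>
#include <vector>
struct PacketSketch { void *send_buf; void *recv_buf; int to; int from; int bytes; };
static void communicate_sketch(MPI_Comm comm,std::vector<PacketSketch> &pkts)
{
  std::vector<MPI_Request> reqs(2*pkts.size());
  for(size_t i=0;i<pkts.size();i++){   // CommunicateBegin: post everything
    MPI_Isend(pkts[i].send_buf,pkts[i].bytes,MPI_CHAR,pkts[i].to  ,0,comm,&reqs[2*i]);
    MPI_Irecv(pkts[i].recv_buf,pkts[i].bytes,MPI_CHAR,pkts[i].from,0,comm,&reqs[2*i+1]);
  }
  // CommunicateComplete: one wait over all outstanding requests
  MPI_Waitall((int)reqs.size(),reqs.data(),MPI_STATUSES_IGNORE);
}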
HaloGather(source,compress); - this->Communicate(); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); CommsMerge(); // spins } -#if 0 - // Overlapping comms and compute typically slows down compute and is useless - // unless memory bandwidth greatly exceeds network - template - std::thread HaloExchangeBegin(const Lattice &source,compressor &compress) { - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - return std::thread([&] { this->Communicate(); }); - } - void HaloExchangeComplete(std::thread &thr) - { - CommsMerge(); // spins - jointime-=usecond(); - thr.join(); - jointime+=usecond(); - } -#endif + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) { @@ -851,6 +779,9 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i int sx = (x+sshift)%rd; int comm_proc = ((x+sshift)/rd)%pd; + cobj *u_send_buf_p; + cobj *comm_buf_p; + if (comm_proc) { int words = buffer_size; @@ -863,16 +794,15 @@ Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,i if ( !face_table_computed ) { t_table-=usecond(); face_table.resize(face_idx+1); - Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]); + cobj *ptr; ptr = &u_send_buf[0]; + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); t_table+=usecond(); } t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so); - face_idx++; + Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++; t_data+=usecond(); gathertime+=usecond(); - - // Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); int rank = _grid->_processor; int recv_from_rank; From c190221fd35eba678ec9810aa3b44816085d8283 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Sat, 22 Oct 2016 18:14:27 +0100 Subject: [PATCH 11/27] Internal SHM comms in non-simd directions working Need to fix simd directions --- benchmarks/Benchmark_dwf.cc | 8 +- lib/Stencil.h | 1609 +++++++++--------- lib/communicator/Communicator_base.h | 15 +- lib/communicator/Communicator_mpi.cc | 8 + lib/communicator/Communicator_mpi3.cc | 105 +- lib/communicator/Communicator_none.cc | 8 + lib/communicator/Communicator_shmem.cc | 8 + lib/qcd/action/fermion/FermionOperatorImpl.h | 857 +++++----- lib/qcd/action/fermion/WilsonFermion.cc | 8 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 101 +- lib/qcd/action/fermion/WilsonFermion5D.h | 289 ++-- lib/qcd/action/fermion/WilsonKernels.cc | 21 +- lib/qcd/action/fermion/WilsonKernels.h | 275 ++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 95 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 55 +- tests/Test_stencil.cc | 6 +- 16 files changed, 1729 insertions(+), 1739 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a32d7535..f75f0385 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -153,7 +153,7 @@ int main (int argc, char ** argv) std::cout< // subdir aggregate +const int ShmDirectCopy = 1; + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. 
CSHIFT is not the best way, so need @@ -68,7 +70,7 @@ // ////////////////////////////////////////////////////////////////////////////////////////// - namespace Grid { +namespace Grid { inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) @@ -117,822 +119,837 @@ PARALLEL_FOR_LOOP } } -template void -Gather_plane_simple_stencil (const Lattice &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off, - double &t_table ,double & t_data ) -{ - std::vector > table; - Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table); - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - Gather_plane_simple_table (table,rhs,buffer,compress,off,so); -} +struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline +}; +template +class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. + public: + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; - struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - }; + std::vector Packets; - template - class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
- public: + int face_table_computed; + std::vector > > face_table ; + + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + comms_bytes+=2.0*bytes; + Packets.push_back(p); + } - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + }else{ + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + else + _grid->SendToRecvFromComplete(reqs[i]); + } + commtime+=usecond(); + } - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + /////////////////////////////////////////// + // Simd merge queue for asynch comms + /////////////////////////////////////////// + struct Merge { + cobj * mpointer; + std::vector rpointers; + Integer buffer_size; + Integer packet_id; + }; + + std::vector Mergers; - struct Packet { - void * send_buf; - void * recv_buf; - Integer to_rank; - Integer from_rank; - Integer bytes; - }; + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { + Merge m; + m.mpointer = merge_p; + m.rpointers= rpointers; + m.buffer_size = buffer_size; + m.packet_id = packet_id; + Mergers.push_back(m); + } - std::vector Packets; + void CommsMerge(void ) { - int face_table_computed; - std::vector > > face_table ; - - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - comms_bytes+=2.0*bytes; - Packets.push_back(p); - } - - void CommunicateBegin(std::vector > &reqs) - { - reqs.resize(Packets.size()); - commtime-=usecond(); - for(int i=0;iSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - commtime+=usecond(); - } - void CommunicateComplete(std::vector > &reqs) - { - commtime-=usecond(); - for(int i=0;iSendToRecvFromComplete(reqs[i]); - } - commtime+=usecond(); - } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - Integer buffer_size; - Integer packet_id; - }; - - std::vector Mergers; - - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); - } - - void CommsMerge(void ) { - - for(int i=0;i _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + // npoints x Osites() of these + // Flat vector, change layout for cache friendly. 
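// A minimal sketch (illustrative) of the address resolution that
// PrecomputeByteOffsets/GetInfo make cheap in the hot loop: each stencil entry
// carries a byte offset already scaled by the right object size (vobj for
// local neighbours, cobj for comms-buffer entries), so resolving a neighbour
// is one select between the lattice base and the receive-buffer base, plus an
// add.
#include <cstdint>
struct EntrySketch { uint64_t byte_offset; uint16_t is_local; };
static inline uint64_t neighbour_addr_sketch(const EntrySketch &e,
                                             uint64_t lattice_base,
                                             uint64_t recv_buf_base)
{
  return (e.is_local ? lattice_base : recv_buf_base) + e.byte_offset;
}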
+ Vector _entries; + + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + + void PrecomputeByteOffsets(void){ + for(int i=0;i<_entries.size();i++){ + if( _entries[i]._is_local ) { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); + } else { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); + } + } + }; - int _checkerboard; - int _npoints; // Move to template param? - GridBase * _grid; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + std::vector > u_simd_send_buf; + std::vector > u_simd_recv_buf; + commVector u_send_buf; + commVector u_recv_buf_hide; + cobj* u_recv_buf_p; - // npoints of these - std::vector _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; + int u_comm_offset; + int _unified_buffer_size; + + cobj *CommBuf(void) { return u_recv_buf_p; } - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. 
- Vector _entries; - - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952 - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; - - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - /////////////////////////////////////////////////////////// - // Comms buffers - /////////////////////////////////////////////////////////// - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; - commVector u_send_buf; - commVector comm_buf; - - int u_comm_offset; - int _unified_buffer_size; - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// #define TIMING_HACK #ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { + double jointime; + double gathertime; + double commtime; + double halogtime; + double mergetime; + double spintime; + double comms_bytes; + double gathermtime; + double splicetime; + double nosplicetime; + double t_data; + double t_table; + double calls; + + void ZeroCounters(void) { + gathertime = 0.; + jointime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + spintime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + t_data = 0.0; + t_table= 0.0; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { #define PRINTIT(A) \ std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls< 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "< 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. 
<< " GB/s "< &directions, - const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) - { - face_table_computed=0; - _npoints = npoints; - _grid = grid; - _directions = directions; - _distances = distances; - _unified_buffer_size=0; - - int osites = _grid->oSites(); - - _entries.resize(_npoints* osites); - for(int ii=0;ii_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - _permute_type[point]=_grid->PermuteType(dimension); - - _checkerboard = checkerboard; - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - int rotate_dim = _grid->_simd_layout[dimension]>2; - - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - - int sshift[2]; - - // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they - // live in lattice or a comms buffer. - if ( !comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Local(point,dimension,shift,0x3); - } else { - Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Local(point,dimension,shift,0x2);// both with block stride loop iteration - } - } else { // All permute extract done in comms phase prior to Stencil application - // So tables are the same whether comm_dim or splice_dim - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Comms(point,dimension,shift,0x3); - } else { - Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Comms(point,dimension,shift,0x2);// both with block stride loop iteration - } - } - } - u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); - - PrecomputeByteOffsets(); - - const int Nsimd = grid->Nsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;l_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int gd = _grid->_gdimensions[dimension]; - int ly = _grid->_simd_layout[dimension]; - - // Map to always positive shift modulo global full dimension. - int shift = (shiftpm+fd)%fd; - - // the permute type - int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - - int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - int sx = (x+sshift)%rd; - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sxNsimd(); - - int fd = _grid->_fdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - int shift = (shiftpm + fd) %fd; - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - - _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and - // send to one or more remote nodes. - - int cb= (cbmask==0x2)? 
Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - - int sx = (x+sshift)%rd; - - int offnode = 0; - if ( simd_layout > 1 ) { - - for(int i=0;i>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { - offnode =1; - } - } - - } else { - int comm_proc = ((x+sshift)/rd)%pd; - offnode = (comm_proc!= 0); - } - - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { - wraparound = 1; - } - if (!offnode) { - - int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - - } else { - - int words = buffer_size; - if (cbmask != 0x3) words=words>>1; - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; - _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - - } - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._permute=permute; - _entries[idx]._is_local=1; - _entries[idx]._around_the_world=wrap; - } - o +=_grid->_slice_stride[dimension]; - } - - } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - int idx = point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._is_local=1; - _entries[idx]._permute=permute; - _entries[idx]._around_the_world=wrap; - } - - } - o +=_grid->_slice_stride[dimension]; - } - - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute=0; - _entries[idx]._around_the_world=wrap; - } - o 
+=_grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - int idx = point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute =0; - _entries[idx]._around_the_world=wrap; - } - } - o +=_grid->_slice_stride[dimension]; - } - } - } - - - - template - void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); // spins - } - - template - void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - - template - void HaloGather(const Lattice &source,compressor &compress) - { - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - assert (comm_buf.size() == _unified_buffer_size ); - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) - { - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - - GridBase *grid=_grid; - assert(rhs._grid==_grid); - // conformable(_grid,rhs._grid); - - int fd = _grid->_fdimensions[dimension]; - int rd = 
_grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(cobj); - - gathertime-=usecond(); - int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - t_table-=usecond(); - face_table.resize(face_idx+1); - cobj *ptr; ptr = &u_send_buf[0]; - Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, - face_table[face_idx]); - t_table+=usecond(); - } - t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++; - t_data+=usecond(); - gathertime+=usecond(); - - int rank = _grid->_processor; - int recv_from_rank; - int xmit_to_rank; - _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - assert (xmit_to_rank != _grid->ThisRank()); - assert (recv_from_rank != _grid->ThisRank()); - - // FIXME Implement asynchronous send & also avoid buffer copy - AddPacket((void *)&u_send_buf[u_comm_offset], - (void *) &comm_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); - - u_comm_offset+=words; - } - } - } - - - template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) - { - const int Nsimd = _grid->Nsimd(); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - // This will not work with a rotate dim - assert(simd_layout==2); - assert(shift>=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; - int words = sizeof(cobj)/sizeof(vector_type); - - assert(cbmask==0x3); // Fixme think there is a latent bug if not true - - int bytes = buffer_size*sizeof(scalar_object); - - std::vector rpointers(Nsimd); - std::vector spointers(Nsimd); - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - - int cb = (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); - int ic= (i&inner_bit)? 
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - void *vrp = (void *)rp; - void *vsp = (void *)sp; - - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = rp; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - }; - } + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances) + : _permute_type(npoints), _comm_buf_size(npoints) + { + face_table_computed=0; + _npoints = npoints; + _grid = grid; + _directions = directions; + _distances = distances; + _unified_buffer_size=0; + + int osites = _grid->oSites(); + + _entries.resize(_npoints* osites); + for(int ii=0;ii_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + _permute_type[point]=_grid->PermuteType(dimension); + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported + + int sshift[2]; + + // Underlying approach. For each local site build + // up a table containing the npoint "neighbours" and whether they + // live in lattice or a comms buffer. 
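// A minimal sketch (illustrative) of the checkerboard dispatch that follows:
// for odd displacements the even and odd checkerboards shift by different
// amounts, so the neighbour table is built in two half-volume passes selected
// by a mask (0x1 even sites, 0x2 odd sites, 0x3 both) instead of one.
static void build_tables_sketch(int sshift_even,int sshift_odd,
                                void (*pass)(int cbmask,int sshift))
{
  if( sshift_even==sshift_odd ){
    pass(0x3,sshift_even);      // favourable: one pass covers both cbs
  } else {
    pass(0x1,sshift_even);      // unfavourable: two block-strided passes
    pass(0x2,sshift_odd);
  }
}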
+ if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Local(point,dimension,shift,0x3); + } else { + Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Local(point,dimension,shift,0x2);// both with block stride loop iteration + } + } else { // All permute extract done in comms phase prior to Stencil application + // So tables are the same whether comm_dim or splice_dim + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Comms(point,dimension,shift,0x3); + } else { + Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Comms(point,dimension,shift,0x2);// both with block stride loop iteration + } + } + } + u_send_buf.resize(_unified_buffer_size); + + ///////////////////////////////////////////////////////////////////////////////// + // Try to allocate for receiving in a shared memory region, fall back to buffer + ///////////////////////////////////////////////////////////////////////////////// + if( ShmDirectCopy ) { + + u_recv_buf_p=(cobj *)_grid->ShmBufferSelf(); + if ( u_recv_buf_p == NULL ) { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + } else { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + + PrecomputeByteOffsets(); + + const int Nsimd = grid->Nsimd(); + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + for(int l=0;l_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + int ly = _grid->_simd_layout[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (shiftpm+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + + int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sxNsimd(); + + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + int shift = (shiftpm + fd) %fd; + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored + + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + + for(int x=0;xPermuteType(dimension); + + int sx = (x+sshift)%rd; + + int offnode = 0; + if ( simd_layout > 1 ) { + + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + if ( nbr_proc ) { + offnode =1; + } + } + + } else { + int comm_proc = ((x+sshift)/rd)%pd; + offnode = (comm_proc!= 0); + } + + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { + wraparound = 1; + } + if (!offnode) { + + int permute_slice=0; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + int words = buffer_size; + if (cbmask != 0x3) words=words>>1; + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + int unified_buffer_offset = _unified_buffer_size; + _unified_buffer_size += words; + + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase + + } + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int o = 0; // relative offset to base within plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._permute=permute; + _entries[idx]._is_local=1; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { + int idx = point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._is_local=1; + _entries[idx]._permute=permute; + _entries[idx]._around_the_world=wrap; + } + + } + o +=_grid->_slice_stride[dimension]; + } + + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute=0; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int 
b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + int idx = point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute =0; + _entries[idx]._around_the_world=wrap; + } + } + o +=_grid->_slice_stride[dimension]; + } + } + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + calls++; + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + CommsMerge(); // spins + } + + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes + GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x1,compress,face_idx); + Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + template + void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + { + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + + GridBase *grid=_grid; + assert(rhs._grid==_grid); + // conformable(_grid,rhs._grid); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + int cb= (cbmask==0x2)? 
Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(cobj); + + gathertime-=usecond(); + int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane + if ( !face_table_computed ) { + t_table-=usecond(); + face_table.resize(face_idx+1); + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); + t_table+=usecond(); + } + + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + assert (xmit_to_rank != _grid->ThisRank()); + assert (recv_from_rank != _grid->ThisRank()); + + ///////////////////////////////////////////////////////// + // try the direct copy if possible + ///////////////////////////////////////////////////////// + + cobj *u_send_buf_p = &u_send_buf[0]; + if (ShmDirectCopy) { + cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank); + if ( shm!=NULL) { + u_send_buf_p = shm; + } + } + + t_data-=usecond(); + Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++; + t_data+=usecond(); + + AddPacket((void *)&u_send_buf_p[u_comm_offset], + (void *)&u_recv_buf_p[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + + gathertime+=usecond(); + u_comm_offset+=words; + } + } + } + + template + void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + { + const int Nsimd = _grid->Nsimd(); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + // This will not work with a rotate dim + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(cobj)/sizeof(vector_type); + + assert(cbmask==0x3); // Fixme think there is a latent bug if not true + + int bytes = buffer_size*sizeof(scalar_object); + + std::vector rpointers(Nsimd); + std::vector spointers(Nsimd); + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x= rd ); + + if ( any_offnode ) { + + for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); + gathermtime+=usecond(); + + for(int i=0;i2 + // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + int nbr_lcoor= (nbr_coor%ld); + int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer + int nbr_ox = (nbr_lcoor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; + auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; + + if(nbr_proc){ + + int recv_from_rank; + int xmit_to_rank; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = rp; + + } else { + + rpointers[i] = sp; + + } + } + + assert(0); + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); + + u_comm_offset +=buffer_size; + } + } + } + +}; +} #endif diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index f2a6bf00..4139f72a 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -80,7 +80,6 @@ class CartesianCommunicator { void * ShmCommBuf; std::vector ShmCommBufs; - std::vector ShmStencilBufs; int WorldRank; int WorldSize; @@ -105,6 +104,10 @@ class CartesianCommunicator { int RankFromProcessorCoor(std::vector &coor); void ProcessorCoorFromRank(int rank,std::vector &coor); + // Helper function for SHM Windows in MPI3 + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + ///////////////////////////////// // Grid information queries ///////////////////////////////// @@ -173,6 +176,16 @@ class CartesianCommunicator { int recv_from_rank, int bytes); void SendToRecvFromComplete(std::vector &waitall); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + void StencilSendToRecvFromComplete(std::vector &waitall) + { + SendToRecvFromComplete(waitall); + } //////////////////////////////////////////////////////////// // Barrier diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 9c66202f..4291b319 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index be4e66a9..f5bbdbda 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Verbose for now ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - std::cout<< "Ranks per node "<< ShmSize << std::endl; - std::cout<< "Nodes "<< GroupSize << std::endl; - std::cout<< "Ranks "<< WorldSize << std::endl; - std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl; + std::cout< &processors) } ShmCommBufs.resize(ShmSize); - 
ShmStencilBufs.resize(ShmSize); for(int r=0;r &processors) ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); + for(int l2=0;l2 &list, void *xmit, @@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { -#undef SHM_USE_BCOPY MPI_Request xrq; MPI_Request rrq; static int sequence; - int rank = _processor; int ierr; int tag; int check; @@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis assert(from != _processor); int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; int gme = GroupRanks[_processor]; sequence++; @@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes &lis list.push_back(xrq); } - // std::cout << "Syncing "< &lis assert(ierr==0); list.push_back(rrq); } - - // std::cout << "Syncing"< &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ MPI_Request xrq; MPI_Request rrq; - int rank = _processor; - int ierr; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); -#endif + int ierr; + + assert(dest != _processor); + assert(from != _processor); + + int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; + int gme = GroupRanks[_processor]; + + assert(gme == ShmRank); + + if ( gdest == MPI_UNDEFINED ) { + ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + assert(ierr==0); + list.push_back(xrq); + } + + if ( gfrom ==MPI_UNDEFINED) { + ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + assert(ierr==0); + list.push_back(rrq); + } + + MPI_Win_sync (ShmWindow); + MPI_Barrier (ShmComm); + MPI_Win_sync (ShmWindow); + } + void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 8601255a..80b8fb90 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) } int Rank(void ){ return 0; }; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 091e266e..4af719b0 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -50,6 +50,14 @@ typedef struct HandShake_t { static Vector< HandShake > XConnections; static Vector< HandShake > RConnections; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 4248ec4c..0800dea6 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -33,511 +33,500 @@ directory #define GRID_QCD_FERMION_OPERATOR_IMPL_H namespace Grid { - - namespace QCD { +namespace QCD { - ////////////////////////////////////////////// - // Template parameter class constructs to package 
- // externally control Fermion implementations - // in orthogonal directions - // - // Ultimately need Impl to always define types where XXX is opaque - // - // typedef typename XXX Simd; - // typedef typename XXX GaugeLinkField; - // typedef typename XXX GaugeField; - // typedef typename XXX GaugeActField; - // typedef typename XXX FermionField; - // typedef typename XXX DoubledGaugeField; - // typedef typename XXX SiteSpinor; - // typedef typename XXX SiteHalfSpinor; - // typedef typename XXX Compressor; - // - // and Methods: - // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) - // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // - // - // To acquire the typedefs from "Base" (either a base class or template param) use: - // - // INHERIT_GIMPL_TYPES(Base) - // INHERIT_FIMPL_TYPES(Base) - // INHERIT_IMPL_TYPES(Base) - // - // The Fermion operators will do the following: - // - // struct MyOpParams { - // RealD mass; - // }; - // - // - // template - // class MyOp : public { - // public: - // - // INHERIT_ALL_IMPL_TYPES(Impl); - // - // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) - // { - // - // }; - // - // } - ////////////////////////////////////////////// - - - //////////////////////////////////////////////////////////////////////// - // Implementation dependent fermion types - //////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////// + // Template parameter class constructs to package + // externally control Fermion implementations + // in orthogonal directions + // + // Ultimately need Impl to always define types where XXX is opaque + // + // typedef typename XXX Simd; + // typedef typename XXX GaugeLinkField; + // typedef typename XXX GaugeField; + // typedef typename XXX GaugeActField; + // typedef typename XXX FermionField; + // typedef typename XXX DoubledGaugeField; + // typedef typename XXX SiteSpinor; + // typedef typename XXX SiteHalfSpinor; + // typedef typename XXX Compressor; + // + // and Methods: + // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) + // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // + // + // To acquire the typedefs from "Base" (either a base class or template param) use: + // + // INHERIT_GIMPL_TYPES(Base) + // INHERIT_FIMPL_TYPES(Base) + // INHERIT_IMPL_TYPES(Base) + // + // The Fermion operators will do the following: + // + // struct MyOpParams { + // RealD mass; + // }; + // + // + // template + // class MyOp : public { + // public: + // + // INHERIT_ALL_IMPL_TYPES(Impl); + // + // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) + // { + // + // }; + // + // } + ////////////////////////////////////////////// + + 
//////////////////////////////////////////////////////////////////////// + // Implementation dependent fermion types + //////////////////////////////////////////////////////////////////////// + #define INHERIT_FIMPL_TYPES(Impl)\ - typedef typename Impl::FermionField FermionField; \ - typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ - typedef typename Impl::SiteSpinor SiteSpinor; \ - typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ - typedef typename Impl::Compressor Compressor; \ - typedef typename Impl::StencilImpl StencilImpl; \ - typedef typename Impl::ImplParams ImplParams; \ - typedef typename Impl::Coeff_t Coeff_t; - + typedef typename Impl::FermionField FermionField; \ + typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ + typedef typename Impl::SiteSpinor SiteSpinor; \ + typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ + typedef typename Impl::Compressor Compressor; \ + typedef typename Impl::StencilImpl StencilImpl; \ + typedef typename Impl::ImplParams ImplParams; \ + typedef typename Impl::Coeff_t Coeff_t; + #define INHERIT_IMPL_TYPES(Base) \ - INHERIT_GIMPL_TYPES(Base) \ - INHERIT_FIMPL_TYPES(Base) + INHERIT_GIMPL_TYPES(Base) \ + INHERIT_FIMPL_TYPES(Base) + + ///////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index + ///////////////////////////////////////////////////////////////////////////// + template + class WilsonImpl : public PeriodicGaugeImpl > { + + public: + + static const int Dimension = Representation::Dimension; + typedef PeriodicGaugeImpl > Gimpl; + + //Necessary? + constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - /////// - // Single flavour four spinors with colour index - /////// - template - class WilsonImpl - : public PeriodicGaugeImpl > { - public: - static const int Dimension = Representation::Dimension; - typedef PeriodicGaugeImpl > Gimpl; - - //Necessary? - constexpr bool is_fundamental() const{return Dimension == Nc ? 
1 : 0;} + const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - - INHERIT_GIMPL_TYPES(Gimpl); + INHERIT_GIMPL_TYPES(Gimpl); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; + inline void multLink(SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu, + StencilEntry *SE, + StencilImpl &St) { + mult(&phi(), &U(mu), &chi()); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; + template + inline void loadLinkElement(Simd ®, ref &memory) { + reg = memory; + } - ImplParams Params; - - WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - inline void multLink(SiteHalfSpinor &phi, - const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, - int mu, - StencilEntry *SE, - StencilImpl &St) { - mult(&phi(), &U(mu), &chi()); + inline void DoubleStore(GridBase *GaugeGrid, + DoubledGaugeField &Uds, + const GaugeField &Umu) { + conformable(Uds._grid, GaugeGrid); + conformable(Umu._grid, GaugeGrid); + GaugeLinkField U(GaugeGrid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); } + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + GaugeLinkField link(mat._grid); + link = TraceIndex(outerProduct(Btilde,A)); + PokeIndex(mat,link,mu); + } - template - inline void loadLinkElement(Simd ®, - ref &memory) { - reg = memory; - } + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - inline void DoubleStore(GridBase *GaugeGrid, - DoubledGaugeField &Uds, - const GaugeField &Umu) { - conformable(Uds._grid, GaugeGrid); - conformable(Umu._grid, GaugeGrid); - GaugeLinkField U(GaugeGrid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - PokeIndex(Uds, U, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uds, U, mu + 4); + int Ls=Btilde._grid->_fdimensions[0]; + GaugeLinkField tmp(mat._grid); + tmp = zero; + + PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + int sU=sss; + for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ - GaugeLinkField link(mat._grid); - link = TraceIndex(outerProduct(Btilde,A)); - PokeIndex(mat,link,mu); - } + PokeIndex(mat,tmp,mu); - inline void InsertForce5D(GaugeField &mat, FermionField 
&Btilde, FermionField Ã,int mu){ - - int Ls=Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; + } + }; - PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - int sU=sss; - for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here - } - } - PokeIndex(mat,tmp,mu); + //////////////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index, 5d redblack + //////////////////////////////////////////////////////////////////////////////////// +template +class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { + public: + + static const int Dimension = Nrepresentation; + const bool LsVectorised=true; + typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplGaugeField = iVector >, Nd>; + template using iImplGaugeLink = iScalar > >; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef Lattice FermionField; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; + + bool overlapCommsCompute(void) { return false; }; + + template + inline void loadLinkElement(Simd ®, ref &memory) { + vsplat(reg, memory); + } + + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { + SiteGaugeLink UU; + for (int i = 0; i < Nrepresentation; i++) { + for (int j = 0; j < Nrepresentation; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); } - }; - - /////// - // Single flavour four spinors with colour index, 5d redblack - /////// - template - class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { - public: + } + mult(&phi(), &UU(), &chi()); + } - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; - typedef PeriodicGaugeImpl > Gimpl; - - INHERIT_GIMPL_TYPES(Gimpl); + inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) + { + SiteScalarGaugeField ScalarUmu; + SiteDoubledGaugeField ScalarUds; + + GaugeLinkField U(Umu._grid); + GaugeField Uadj(Umu._grid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uadj, U, mu); + } + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; - template using iImplGaugeField = iVector >, Nd>; - template using iImplGaugeLink = iScalar > >; + peekLocalSite(ScalarUmu, Umu, lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; + peekLocalSite(ScalarUmu, Uadj, 
lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - // Make the doubled gauge field a *scalar* - typedef iImplDoubledGaugeField - SiteDoubledGaugeField; // This is a scalar - typedef iImplGaugeField - SiteScalarGaugeField; // scalar - typedef iImplGaugeLink - SiteScalarGaugeLink; // scalar + pokeLocalSite(ScalarUds, Uds, lcoor); + } + } - typedef Lattice DoubledGaugeField; + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) + { + assert(0); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; - - ImplParams Params; - - DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return false; }; - - template - inline void loadLinkElement(Simd ®, ref &memory) { - vsplat(reg, memory); - } - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - SiteGaugeLink UU; - for (int i = 0; i < Nrepresentation; i++) { - for (int j = 0; j < Nrepresentation; j++) { - vsplat(UU()()(i, j), U(mu)()(i, j)); - } - } - mult(&phi(), &UU(), &chi()); - } - - inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, - const GaugeField &Umu) { - SiteScalarGaugeField ScalarUmu; - SiteDoubledGaugeField ScalarUds; - - GaugeLinkField U(Umu._grid); - GaugeField Uadj(Umu._grid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uadj, U, mu); - } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUmu, Umu, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - - peekLocalSite(ScalarUmu, Uadj, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } - } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, - FermionField &A, int mu) { + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu) + { assert(0); - } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - assert(0); - } - }; + } +}; //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? 
//////////////////////////////////////////////////////////////////////////////////////// - template - class GparityWilsonImpl - : public ConjugateGaugeImpl > { - public: - static const int Dimension = Nrepresentation; +template +class GparityWilsonImpl : public ConjugateGaugeImpl > { + public: - const bool LsVectorised=false; + static const int Dimension = Nrepresentation; - typedef _Coeff_t Coeff_t; - typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + const bool LsVectorised=false; + + typedef _Coeff_t Coeff_t; + typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); - INHERIT_GIMPL_TYPES(Gimpl); + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - template - using iImplSpinor = - iVector, Ns>, Ngp>; - template - using iImplHalfSpinor = - iVector, Nhs>, Ngp>; - template - using iImplDoubledGaugeField = - iVector >, Nds>, Ngp>; + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonStencil StencilImpl; + + typedef GparityWilsonImplParams ImplParams; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; - - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; - - typedef WilsonCompressor Compressor; - typedef WilsonStencil StencilImpl; + ImplParams Params; - typedef GparityWilsonImplParams ImplParams; - - ImplParams Params; + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + // provide the multiply by link that is differentiated between Gparity (with + // flavour index) and non-Gparity + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - // provide the multiply by link that is differentiated between Gparity (with - // flavour index) and non-Gparity - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - typedef SiteHalfSpinor vobj; - typedef typename SiteHalfSpinor::scalar_object sobj; + typedef SiteHalfSpinor vobj; + typedef typename SiteHalfSpinor::scalar_object sobj; - vobj vtmp; - sobj stmp; + vobj vtmp; + sobj stmp; - GridBase *grid = St._grid; + GridBase *grid = St._grid; - const int Nsimd = grid->Nsimd(); + const int Nsimd = grid->Nsimd(); - int direction = St._directions[mu]; - int distance = St._distances[mu]; - int ptype = St._permute_type[mu]; - int sl = St._grid->_simd_layout[direction]; + int direction = St._directions[mu]; + int distance = St._distances[mu]; + int ptype = St._permute_type[mu]; + int sl = St._grid->_simd_layout[direction]; + + // Fixme X.Y.Z.T hardcode in stencil + int mmu = mu % Nd; - // Fixme X.Y.Z.T hardcode in stencil - int mmu = mu % Nd; + // assert our assumptions + assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code + assert((sl == 1) || (sl == 2)); + + std::vector icoor; - // assert our assumptions - assert((distance == 1) || (distance == -1)); // 
nearest neighbour stencil hard code - assert((sl == 1) || (sl == 2)); - - std::vector icoor; - - if ( SE->_around_the_world && Params.twists[mmu] ) { + if ( SE->_around_the_world && Params.twists[mmu] ) { - if ( sl == 2 ) { + if ( sl == 2 ) { + + std::vector vals(Nsimd); - std::vector vals(Nsimd); + extract(chi,vals); + for(int s=0;siCoorFromIindex(icoor,s); + grid->iCoorFromIindex(icoor,s); - assert((icoor[direction]==0)||(icoor[direction]==1)); + assert((icoor[direction]==0)||(icoor[direction]==1)); - int permute_lane; - if ( distance == 1) { - permute_lane = icoor[direction]?1:0; - } else { - permute_lane = icoor[direction]?0:1; + int permute_lane; + if ( distance == 1) { + permute_lane = icoor[direction]?1:0; + } else { + permute_lane = icoor[direction]?0:1; + } + + if ( permute_lane ) { + stmp(0) = vals[s](1); + stmp(1) = vals[s](0); + vals[s] = stmp; } - - if ( permute_lane ) { - stmp(0) = vals[s](1); - stmp(1) = vals[s](0); - vals[s] = stmp; - } - } - merge(vtmp,vals); + } + merge(vtmp,vals); + + } else { + vtmp(0) = chi(1); + vtmp(1) = chi(0); + } + mult(&phi(0),&U(0)(mu),&vtmp(0)); + mult(&phi(1),&U(1)(mu),&vtmp(1)); + + } else { + mult(&phi(0),&U(0)(mu),&chi(0)); + mult(&phi(1),&U(1)(mu),&chi(1)); + } + + } - } else { - vtmp(0) = chi(1); - vtmp(1) = chi(0); - } - mult(&phi(0),&U(0)(mu),&vtmp(0)); - mult(&phi(1),&U(1)(mu),&vtmp(1)); - - } else { - mult(&phi(0),&U(0)(mu),&chi(0)); - mult(&phi(1),&U(1)(mu),&chi(1)); - } + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + { + conformable(Uds._grid,GaugeGrid); + conformable(Umu._grid,GaugeGrid); + + GaugeLinkField Utmp (GaugeGrid); + GaugeLinkField U (GaugeGrid); + GaugeLinkField Uconj(GaugeGrid); + + Lattice > coor(GaugeGrid); - } - - inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - { - - conformable(Uds._grid,GaugeGrid); - conformable(Umu._grid,GaugeGrid); - - GaugeLinkField Utmp (GaugeGrid); - GaugeLinkField U (GaugeGrid); - GaugeLinkField Uconj(GaugeGrid); - - Lattice > coor(GaugeGrid); - - - for(int mu=0;mu(Umu,mu); - Uconj = conjugate(U); + U = PeekIndex(Umu,mu); + Uconj = conjugate(U); + + // This phase could come from a simple bc 1,1,-1,1 .. + int neglink = GaugeGrid->GlobalDimensions()[mu]-1; + if ( Params.twists[mu] ) { + Uconj = where(coor==neglink,-Uconj,Uconj); + } - // This phase could come from a simple bc 1,1,-1,1 .. - int neglink = GaugeGrid->GlobalDimensions()[mu]-1; - if ( Params.twists[mu] ) { - Uconj = where(coor==neglink,-Uconj,Uconj); - } +PARALLEL_FOR_LOOP + for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - PARALLEL_FOR_LOOP - for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { - link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); - } - PokeIndex(mat, link, mu); - return; - } + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { + + // DhopDir provides U or Uconj depending on coor/flavour. + GaugeLinkField link(mat._grid); + // use lorentz for flavour as hack. 
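// Editorial note (not part of the patch): the "hack" is that the flavour
// doublet occupies the slot normally used for the Lorentz index, so in the
// loop that follows tmp[ss](0,0) is the flavour-0 contribution and
// conjugate(tmp[ss](1,1)) the flavour-1 contribution, conjugated because
// that flavour couples through the conjugated links stored by DoubleStore;
// note the relative sign of the two terms differs between this 4D routine
// and the 5D variant further down.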
+ auto tmp = TraceIndex(outerProduct(Btilde, A)); +PARALLEL_FOR_LOOP + for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { + link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); + } + PokeIndex(mat, link, mu); + return; + } - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - int Ls = Btilde._grid->_fdimensions[0]; + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { + + int Ls = Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; - PARALLEL_FOR_LOOP - for (int ss = 0; ss < tmp._grid->oSites(); ss++) { - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); - tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - } - PokeIndex(mat, tmp, mu); - return; - } - }; + GaugeLinkField tmp(mat._grid); + tmp = zero; +PARALLEL_FOR_LOOP + for (int ss = 0; ss < tmp._grid->oSites(); ss++) { + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); + tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + } + PokeIndex(mat, tmp, mu); + return; + } - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +}; + typedef WilsonImpl WilsonImplR; // Real.. whichever prec + typedef WilsonImpl WilsonImplF; // Float + typedef WilsonImpl WilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplF; // Float + typedef WilsonImpl ZWilsonImplD; // Double + + typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec + typedef WilsonImpl WilsonAdjImplF; // Float + typedef WilsonImpl WilsonAdjImplD; // Double + + typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec + typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float + typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double + + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double + + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + + typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec + typedef GparityWilsonImpl GparityWilsonImplF; // Float + typedef GparityWilsonImpl GparityWilsonImplD; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +}} - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. 
whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double - - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double -} -} #endif diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index d887e05b..4bc28bc7 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -166,7 +166,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// PARALLEL_FOR_LOOP for (int sss = 0; sss < B._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); } @@ -277,7 +277,7 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, + Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } }; @@ -295,13 +295,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 7c4fafff..8c059667 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -184,44 +184,37 @@ void WilsonFermion5D::Report(void) if ( DhopCalls > 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime - << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " - << DhopComputeTime << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - 
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<::DerivInternal(StencilImpl & st, assert(sF < B._grid->oSites()); assert(sU < U._grid->oSites()); - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, - gamma); + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); //////////////////////////// // spin trace outer product @@ -342,10 +334,10 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, } template -void WilsonFermion5D::DhopDeriv( GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) +void WilsonFermion5D::DhopDeriv(GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionGrid()); conformable(A._grid,B._grid); @@ -358,9 +350,9 @@ void WilsonFermion5D::DhopDeriv( GaugeField &mat, template void WilsonFermion5D::DhopDerivEO(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -376,9 +368,9 @@ void WilsonFermion5D::DhopDerivEO(GaugeField &mat, template void WilsonFermion5D::DhopDerivOE(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -393,8 +385,8 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, template void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) { // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -412,27 +404,25 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); } #ifdef AVX512 } else if (stat.is_init() ) { int nthreads; stat.start(); - #pragma omp parallel +#pragma omp parallel { - #pragma omp master +#pragma omp master nthreads = omp_get_num_threads(); int mythread = omp_get_thread_num(); stat.enter(mythread); - #pragma omp for nowait - for(int ss=0;ssoSites();ss++) - { - int sU=ss; - int sF=LLs*sU; - Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); - } +#pragma omp for nowait + for(int ss=0;ssoSites();ss++) { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } stat.exit(mythread); } stat.accum(nthreads); @@ -442,8 +432,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } } DhopComputeTime+=usecond(); diff --git 
a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index bc7ec543..295e2997 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -34,155 +34,154 @@ Author: paboyle #include namespace Grid { +namespace QCD { - namespace QCD { + //////////////////////////////////////////////////////////////////////////////// + // This is the 4d red black case appropriate to support + // + // parity = (x+y+z+t)|2; + // generalised five dim fermions like mobius, zolotarev etc.. + // + // i.e. even even contains fifth dim hopping term. + // + // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] + //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - // This is the 4d red black case appropriate to support - // - // parity = (x+y+z+t)|2; - // generalised five dim fermions like mobius, zolotarev etc.. - // - // i.e. even even contains fifth dim hopping term. - // - // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] - //////////////////////////////////////////////////////////////////////////////// - - class WilsonFermion5DStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static const std::vector directions; - static const std::vector displacements; - const int npoint = 8; - }; - - template - class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic - { - public: - INHERIT_IMPL_TYPES(Impl); - typedef WilsonKernels Kernels; - PmuStat stat; - - void Report(void); - void ZeroCounters(void); - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - - double DerivCalls; - double DerivCommTime; - double DerivComputeTime; - double DerivDhopComputeTime; - - /////////////////////////////////////////////////////////////// - // Implement the abstract base - /////////////////////////////////////////////////////////////// - GridBase *GaugeGrid(void) { return _FourDimGrid ;} - GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} - GridBase *FermionGrid(void) { return _FiveDimGrid;} - GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} - - // full checkerboard operations; leave unimplemented as abstract for now - virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - - // half checkerboard operations; leave unimplemented as abstract for now - virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; - - virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac - - // These can be overridden by fancy 5d chiral action - virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivOE(GaugeField &mat,const 
FermionField &U,const FermionField &V,int dag); - - // Implement hopping term non-hermitian hopping term; half cb or both - // Implement s-diagonal DW - void DW (const FermionField &in, FermionField &out,int dag); - void Dhop (const FermionField &in, FermionField &out,int dag); - void DhopOE(const FermionField &in, FermionField &out,int dag); - void DhopEO(const FermionField &in, FermionField &out,int dag); - - // add a DhopComm + class WilsonFermion5DStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static const std::vector directions; + static const std::vector displacements; + const int npoint = 8; + }; + + template + class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic + { + public: + INHERIT_IMPL_TYPES(Impl); + typedef WilsonKernels Kernels; + PmuStat stat; + + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _FourDimGrid ;} + GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} + GridBase *FermionGrid(void) { return _FiveDimGrid;} + GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} + + // full checkerboard operations; leave unimplemented as abstract for now + virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + + // half checkerboard operations; leave unimplemented as abstract for now + virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; + + virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + + // These can be overridden by fancy 5d chiral action + virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + + // Implement hopping term non-hermitian hopping term; half cb or both + // Implement s-diagonal DW + void DW (const FermionField &in, FermionField &out,int dag); + void Dhop (const FermionField &in, FermionField &out,int dag); + void DhopOE(const FermionField &in, FermionField &out,int dag); + void DhopEO(const FermionField &in, FermionField &out,int dag); + + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. 
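// Editorial note (not part of the patch): DhopDir, declared next, exchanges
// the halo for a single (direction, displacement) pair per call, so probing
// all eight stencil points this way repeats the communication each time; the
// Stencil's HaloGather/CommunicateBegin/CommunicateComplete path earlier in
// this series already gathers every point into one unified buffer before a
// single communication round, which is the shape a dedicated DhopComm
// interface would take.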
- void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); - - /////////////////////////////////////////////////////////////// - // New methods added - /////////////////////////////////////////////////////////////// - void DerivInternal(StencilImpl & st, - DoubledGaugeField & U, - GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag); - - void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - // Constructors - WilsonFermion5D(GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _M5,const ImplParams &p= ImplParams()); - - // Constructors - /* + void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + + /////////////////////////////////////////////////////////////// + // New methods added + /////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl & st, + DoubledGaugeField & U, + GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag); + + void DhopInternal(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors + WilsonFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + + // Constructors + /* WilsonFermion5D(int simd, - GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - double _M5,const ImplParams &p= ImplParams()); - */ + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + double _M5,const ImplParams &p= ImplParams()); + */ + + // DoubleStore + void ImportGauge(const GaugeField &_Umu); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + public: + + // Add these to the support from Wilson + GridBase *_FourDimGrid; + GridBase *_FourDimRedBlackGrid; + GridBase *_FiveDimGrid; + GridBase *_FiveDimRedBlackGrid; + + double M5; + int Ls; + + //Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + // Comms buffer + std::vector > comm_buf; + + }; - // DoubleStore - void ImportGauge(const GaugeField &_Umu); - - /////////////////////////////////////////////////////////////// - // Data members require to support the functionality - /////////////////////////////////////////////////////////////// - public: - - // Add these to the support from Wilson - GridBase *_FourDimGrid; - GridBase *_FourDimRedBlackGrid; - GridBase *_FiveDimGrid; - GridBase *_FiveDimRedBlackGrid; - - double M5; - int Ls; - - //Defines the stencils for even and odd - StencilImpl Stencil; - StencilImpl StencilEven; - StencilImpl StencilOdd; - - // Copy of the gauge field , with even and odd subsets - DoubledGaugeField Umu; - DoubledGaugeField UmuEven; - DoubledGaugeField UmuOdd; - - LebesgueOrder 
Lebesgue; - LebesgueOrder LebesgueEvenOdd; - - // Comms buffer - std::vector > comm_buf; - - }; - } -} +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 58e62cc8..52ee8704 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -43,10 +43,9 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// template -void WilsonKernels::DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -220,10 +219,9 @@ void WilsonKernels::DiracOptGenericDhopSiteDag( // Need controls to do interior, exterior, or both template -void WilsonKernels::DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -396,10 +394,9 @@ void WilsonKernels::DiracOptGenericDhopSite( }; template -void WilsonKernels::DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out, int dir, int gamma) { +void WilsonKernels::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out, int dir, int gamma) { + SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteSpinor result; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 66770c3c..669ee4be 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -32,175 +32,132 @@ directory #define GRID_QCD_DHOP_H namespace Grid { +namespace QCD { - namespace QCD { - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Helper routines that implement Wilson stencil for a single site. - // Common to both the WilsonFermion and WilsonFermion5D - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class WilsonKernelsStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static int AsmOpt; // these are a temporary hack - static int HandOpt; // these are a temporary hack - }; - - template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { - public: - - INHERIT_IMPL_TYPES(Impl); - typedef FermionOperator Base; - - public: - - template - typename std::enable_if::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Helper routines that implement Wilson stencil for a single site. 
+ // Common to both the WilsonFermion and WilsonFermion5D + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class WilsonKernelsStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static int AsmOpt; // these are a temporary hack + static int HandOpt; // these are a temporary hack +}; + +template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { + public: + + INHERIT_IMPL_TYPES(Impl); + typedef FermionOperator Base; + +public: + + template + typename std::enable_if::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { #ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, - in, out); - - } else { + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { #else - { + { #endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); + sF++; } - - template - typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, - out); - sF++; - } - sU++; - } - } - - template - typename std::enable_if::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { -#ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, - Ns, in, out); - } else { -#else - { -#endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, - sU, in, out); - sF++; - } - sU++; - } - } - } - - template - typename std::enable_if< - (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, - void>::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } - - void DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, - int gamma); - - private: - // Specialised variants - void DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField 
&out); - - void DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptAsmDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptAsmDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptHandDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptHandDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - public: - WilsonKernels(const ImplParams &p = ImplParams()); - }; - + sU++; } } + } + + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); + sF++; + } + sU++; + } + } + + template + typename std::enable_if::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { +#else + { +#endif + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + } + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); + +private: + // Specialised variants + void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + + void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + void 
DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + +public: + + WilsonKernels(const ImplParams &p = ImplParams()); + +}; + +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 7857e89a..74862400 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -33,31 +33,27 @@ Author: paboyle namespace Grid { - namespace QCD { +namespace QCD { - /////////////////////////////////////////////////////////// - // Default to no assembler implementation - /////////////////////////////////////////////////////////// - template - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - template - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - +/////////////////////////////////////////////////////////// +// Default to no assembler implementation +/////////////////////////////////////////////////////////// +template void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} +template void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} #if defined(AVX512) - /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// @@ -65,16 +61,16 @@ namespace Grid { #include #include - static Vector signs; +static Vector signs; - int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; - } - static int signInit = setupSigns(); + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -84,17 +80,15 @@ namespace Grid { #define FX(A) WILSONASM_ ##A #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const 
FermionField &in, FermionField &out) #include #undef VMOVIDUP @@ -109,31 +103,26 @@ namespace Grid { #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif - #define INSTANTIATE_ASM(A)\ -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ + \ +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ - INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(ZWilsonImplF); @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); - } -} + +}} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 9d7eac23..f5900832 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -311,10 +311,9 @@ namespace Grid { namespace QCD { - template - void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -554,10 +553,9 @@ namespace QCD { } } - template - void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { // std::cout << "Hand op Dhop "< -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int 
sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } @@ -840,12 +835,10 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, // Need Nc=3 though // #define INSTANTIATE_THEM(A) \ -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 0e35a414..1b71b8a5 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; else - Check._odata[i] = myStencil.comm_buf[SE->_offset]; + Check._odata[i] = myStencil.CommBuf()[SE->_offset]; } Real nrmC = norm2(Check); @@ -207,7 +207,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; else - OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; + OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; } for(int i=0;ioSites();i++){ int permute_type; @@ -220,7 +220,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; else - ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; + ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; } setCheckerboard(Check,ECheck); From ea25a4d9ac9d07f223a0c362d9ed7ec5fff998a8 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Sun, 23 Oct 2016 06:10:05 +0100 Subject: [PATCH 12/27] Works --- lib/Stencil.h | 60 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 6b942539..f5fa1f8b 100644 --- a/lib/Stencil.h +++ 
b/lib/Stencil.h @@ -289,11 +289,17 @@ PARALLEL_FOR_LOOP /////////////////////////////////////////////////////////// // Unified Comms buffers for all directions /////////////////////////////////////////////////////////// - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; + // Vectors that live on the symmetric heap in case of SHMEM + std::vector > u_simd_send_buf_hide; + std::vector > u_simd_recv_buf_hide; commVector u_send_buf; commVector u_recv_buf_hide; + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; int u_comm_offset; int _unified_buffer_size; @@ -427,32 +433,42 @@ PARALLEL_FOR_LOOP } } } - u_send_buf.resize(_unified_buffer_size); ///////////////////////////////////////////////////////////////////////////////// // Try to allocate for receiving in a shared memory region, fall back to buffer ///////////////////////////////////////////////////////////////////////////////// - if( ShmDirectCopy ) { + const int Nsimd = grid->Nsimd(); - u_recv_buf_p=(cobj *)_grid->ShmBufferSelf(); - if ( u_recv_buf_p == NULL ) { - u_recv_buf_hide.resize(_unified_buffer_size); - u_recv_buf_p=&u_recv_buf_hide[0]; + uint8_t *shm_ptr = (uint8_t *)_grid->ShmBufferSelf(); + + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + + u_send_buf.resize(_unified_buffer_size); + + if( ShmDirectCopy && shm_ptr != NULL ) { + + u_recv_buf_p=(cobj *)shm_ptr; shm_ptr+= _unified_buffer_size*sizeof(cobj); + for(int l=0;l(_unified_buffer_size)); + u_simd_recv_buf_hide.resize(Nsimd,commVector(_unified_buffer_size)); + u_recv_buf_p=&u_recv_buf_hide[0]; + for(int l=0;lNsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;lShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - + + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + auto shm_or_rp = rp; + if (ShmDirectCopy) { + scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(xmit_to_rank,sp); + if ( shm!=NULL) { + shm_or_rp = shm; + } + } - rpointers[i] = rp; + rpointers[i] = shm_or_rp; } else { @@ -942,7 +967,6 @@ PARALLEL_FOR_LOOP } } - assert(0); AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); u_comm_offset +=buffer_size; From b6a65059a21f00c2d9f2a01f10f550996d6acb02 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Mon, 24 Oct 2016 17:30:43 +0100 Subject: [PATCH 13/27] Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions --- benchmarks/Benchmark_dwf.cc | 21 +- lib/Init.cc | 14 +- lib/Init.h | 2 + lib/Log.cc | 5 + lib/Log.h | 59 ++-- lib/Makefile.am | 4 + lib/Stencil.h | 73 ++--- lib/communicator/Communicator_base.cc | 132 +++++++++ lib/communicator/Communicator_base.h | 313 ++++++++++---------- lib/communicator/Communicator_mpi.cc | 41 +-- lib/communicator/Communicator_mpi3.cc | 388 ++++++++++++++----------- lib/communicator/Communicator_none.cc | 55 ++-- lib/communicator/Communicator_shmem.cc | 57 ++-- 13 files changed, 706 insertions(+), 458 deletions(-) create mode 100644 lib/communicator/Communicator_base.cc diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index f75f0385..c9e2fa22 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -156,6 +156,7 @@ int main (int argc, char ** argv) std::cout<1.0e-5) { + setCheckerboard(ssrc,ssrc_o); + 
setCheckerboard(ssrc,ssrc_e); + std::cout<< ssrc << std::endl; + } } @@ -306,7 +314,7 @@ int main (int argc, char ** argv) std::cout< & vec){ ///////////////////////////////////////////////////////// // ///////////////////////////////////////////////////////// +static int Grid_is_initialised = 0; + + void Grid_init(int *argc,char ***argv) { + GridLogger::StopWatch.Start(); + CartesianCommunicator::Init(argc,argv); // Parse command line args. - GridLogger::StopWatch.Start(); - std::string arg; std::vector logstreams; std::string defaultLog("Error,Warning,Message,Performance"); @@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ LebesgueOrder::UseLebesgueOrder=1; } - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); } + if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){ + GridLogTimestamp(1); + } + GridParseLayout(*argv,*argc, Grid_default_latt, Grid_default_mpi); @@ -274,6 +280,8 @@ void Grid_init(int *argc,char ***argv) std::cout << "GNU General Public License for more details."< &GridDefaultMpi(void); const int &GridThreads(void) ; void GridSetThreads(int t) ; + void GridLogTimestamp(int); // Common parsing chores std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); diff --git a/lib/Log.cc b/lib/Log.cc index a55a1c9e..d4ac42ee 100644 --- a/lib/Log.cc +++ b/lib/Log.cc @@ -34,8 +34,13 @@ directory namespace Grid { GridStopWatch Logger::StopWatch; +int Logger::timestamp; std::ostream Logger::devnull(0); +void GridLogTimestamp(int on){ + Logger::Timestamp(on); +} + Colours GridLogColours(0); GridLogger GridLogError(1, "Error", GridLogColours, "RED"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); diff --git a/lib/Log.h b/lib/Log.h index 156f52ee..dd3fe927 100644 --- a/lib/Log.h +++ b/lib/Log.h @@ -37,10 +37,11 @@ #include #endif - namespace Grid { +namespace Grid { +////////////////////////////////////////////////////////////////////////////////////////////////// // Dress the output; use std::chrono for time stamping via the StopWatch class -int Rank(void); // used for early stage debug before library init +////////////////////////////////////////////////////////////////////////////////////////////////// class Colours{ @@ -55,7 +56,6 @@ public: void Active(bool activate){ is_active=activate; - if (is_active){ colour["BLACK"] ="\033[30m"; colour["RED"] ="\033[31m"; @@ -66,21 +66,18 @@ public: colour["CYAN"] ="\033[36m"; colour["WHITE"] ="\033[37m"; colour["NORMAL"] ="\033[0;39m"; - } else { - colour["BLACK"] =""; - colour["RED"] =""; - colour["GREEN"] =""; - colour["YELLOW"]=""; - colour["BLUE"] =""; - colour["PURPLE"]=""; - colour["CYAN"] =""; - colour["WHITE"] =""; - colour["NORMAL"]=""; - } - - -}; - + } else { + colour["BLACK"] =""; + colour["RED"] =""; + colour["GREEN"] =""; + colour["YELLOW"]=""; + colour["BLUE"] =""; + colour["PURPLE"]=""; + colour["CYAN"] =""; + colour["WHITE"] =""; + colour["NORMAL"]=""; + } + }; }; @@ -88,6 +85,7 @@ class Logger { protected: Colours &Painter; int active; + static int timestamp; std::string name, topName; std::string COLOUR; @@ -99,25 +97,28 @@ public: std::string evidence() {return Painter.colour["YELLOW"];} std::string colour() {return Painter.colour[COLOUR];} - Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) - : active(on), - name(nm), - topName(topNm), 
- Painter(col_class), - COLOUR(col){} ; + Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on), + name(nm), + topName(topNm), + Painter(col_class), + COLOUR(col) {} ; void Active(int on) {active = on;}; int isActive(void) {return active;}; + static void Timestamp(int on) {timestamp = on;}; friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - StopWatch.Stop(); - GridTime now = StopWatch.Elapsed(); - StopWatch.Start(); stream << log.background()<< log.topName << log.background()<< " : "; stream << log.colour() < > u_simd_send_buf_hide; - std::vector > u_simd_recv_buf_hide; - commVector u_send_buf; - commVector u_recv_buf_hide; + // std::vector > u_simd_send_buf_hide; + // std::vector > u_simd_recv_buf_hide; + // commVector u_send_buf_hide; + // commVector u_recv_buf_hide; + // These are used; either SHM objects or refs to the above symmetric heap vectors // depending on comms target cobj* u_recv_buf_p; @@ -439,36 +440,19 @@ PARALLEL_FOR_LOOP ///////////////////////////////////////////////////////////////////////////////// const int Nsimd = grid->Nsimd(); - uint8_t *shm_ptr = (uint8_t *)_grid->ShmBufferSelf(); + _grid->ShmBufferFreeAll(); u_simd_send_buf.resize(Nsimd); u_simd_recv_buf.resize(Nsimd); - u_send_buf.resize(_unified_buffer_size); - - if( ShmDirectCopy && shm_ptr != NULL ) { - - u_recv_buf_p=(cobj *)shm_ptr; shm_ptr+= _unified_buffer_size*sizeof(cobj); - for(int l=0;l(_unified_buffer_size)); - u_simd_recv_buf_hide.resize(Nsimd,commVector(_unified_buffer_size)); - - u_recv_buf_p=&u_recv_buf_hide[0]; - for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); } PrecomputeByteOffsets(); - } void Local (int point, int dimension,int shiftpm,int cbmask) @@ -698,6 +682,7 @@ PARALLEL_FOR_LOOP calls++; Mergers.resize(0); Packets.resize(0); + _grid->StencilBarrier(); HaloGather(source,compress); this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); @@ -836,19 +821,17 @@ PARALLEL_FOR_LOOP // try the direct copy if possible ///////////////////////////////////////////////////////// - cobj *u_send_buf_p = &u_send_buf[0]; - if (ShmDirectCopy) { - cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank); - if ( shm!=NULL) { - u_send_buf_p = shm; - } + + cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); + if ( (ShmDirectCopy==0)||send_buf==NULL ) { + send_buf = u_send_buf_p; } t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++; + Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; t_data+=usecond(); - AddPacket((void *)&u_send_buf_p[u_comm_offset], + AddPacket((void *)&send_buf[u_comm_offset], (void *)&u_recv_buf_p[u_comm_offset], xmit_to_rank, recv_from_rank, @@ -947,18 +930,16 @@ PARALLEL_FOR_LOOP _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - auto shm_or_rp = rp; - if (ShmDirectCopy) { - scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(xmit_to_rank,sp); - if ( shm!=NULL) { - shm_or_rp = shm; - } - } + scalar_object *shm = (scalar_object *)
_grid->ShmBufferTranslate(recv_from_rank,sp); + if ((ShmDirectCopy==0)||(shm==NULL)) { + shm = rp; + } - rpointers[i] = shm_or_rp; + // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node + // assuming above pointer flip + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = shm; } else { diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc new file mode 100644 index 00000000..1272b6a2 --- /dev/null +++ b/lib/communicator/Communicator_base.cc @@ -0,0 +1,132 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_base.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include "Grid.h" +namespace Grid { + +/////////////////////////////////////////////////////////////// +// Info that is setup once and independent of cartesian layout +/////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; +void * CartesianCommunicator::ShmCommBuf; + +///////////////////////////////// +// Alloc, free shmem region +///////////////////////////////// +void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){ + // bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes + void *ptr = (void *)heap_top; + heap_top += bytes; + heap_bytes+= bytes; + assert(heap_bytes < MAX_MPI_SHM_BYTES); + return ptr; +} +void CartesianCommunicator::ShmBufferFreeAll(void) { + heap_top =(size_t)ShmBufferSelf(); + heap_bytes=0; +} + +///////////////////////////////// +// Grid information queries +///////////////////////////////// +int CartesianCommunicator::IsBoss(void) { return _processor==0; }; +int CartesianCommunicator::BossRank(void) { return 0; }; +int CartesianCommunicator::ThisRank(void) { return _processor; }; +const std::vector & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; +const std::vector & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; +int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; + +//////////////////////////////////////////////////////////////////////////////// +// very VERY rarely (Log, serial RNG) we need world without a grid +//////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void) {
return WorldRank; }; +int CartesianCommunicator::Ranks (void) { return WorldSize; }; +int CartesianCommunicator::Nodes (void) { return GroupSize; }; +int CartesianCommunicator::Cores (void) { return ShmSize; }; +int CartesianCommunicator::NodeRank (void) { return GroupRank; }; +int CartesianCommunicator::CoreRank (void) { return ShmRank; }; + +void CartesianCommunicator::GlobalSum(ComplexF &c) +{ + GlobalSumVector((float *)&c,2); +} +void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) +{ + GlobalSumVector((float *)c,2*N); +} +void CartesianCommunicator::GlobalSum(ComplexD &c) +{ + GlobalSumVector((double *)&c,2); +} +void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) +{ + GlobalSumVector((double *)c,2*N); +} + +#ifndef GRID_COMMS_MPI3 + +void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes) +{ + SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); +} +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +{ + SendToRecvFromComplete(waitall); +} +void CartesianCommunicator::StencilBarrier(void){}; + +commVector CartesianCommunicator::ShmBufStorageVector; + +void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } +void *CartesianCommunicator::ShmBuffer(int rank) { + if (rank != ShmRank ) return NULL; + else return ShmCommBuf; +} +void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { + if (rank != ShmRank ) return NULL; + else return local_p; +} +void CartesianCommunicator::ShmInitGeneric(void){ + ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); + ShmCommBuf=(void *)&ShmBufStorageVector[0]; +} + +#endif + +} + diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 4139f72a..576c749e 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -40,169 +40,188 @@ Author: Peter Boyle #ifdef GRID_COMMS_SHMEM #include #endif + namespace Grid { + class CartesianCommunicator { public: + // 65536 ranks per node adequate for now + // 128MB shared memory for comms enough for 48^4 local vol comms + // Give external control (command line override?) of this + + static const int MAXLOG2RANKSPERNODE = 16; + static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024; + // Communicator should know nothing of the physics grid, only processor grid. - - int _Nprocessors; // How many in all - std::vector _processors; // Which dimensions get relayed out over processors lanes. - int _processor; // linear processor rank - std::vector _processor_coor; // linear processor coordinate - unsigned long _ndimension; + int _Nprocessors; // How many in all + std::vector _processors; // Which dimensions get relayed out over processors lanes.
+ int _processor; // linear processor rank + std::vector _processor_coor; // linear processor coordinate + unsigned long _ndimension; -#ifdef GRID_COMMS_MPI - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; -#elif GRID_COMMS_MPI3 - int shm_mode; - - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; - - const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now - const uint64_t MAX_MPI_SHM_BYTES = 256*1024*1024; // 256MB shared memory for comms enought for 48^4 local vol comms - - std::vector WorldDims; - std::vector GroupDims; - std::vector ShmDims; - - std::vector GroupCoor; - std::vector ShmCoor; - std::vector WorldCoor; - - static std::vector GroupRanks; - static std::vector MyGroup; - static int ShmSetup; - static MPI_Win ShmWindow; - static MPI_Comm ShmComm; - - void * ShmCommBuf; - std::vector ShmCommBufs; - - int WorldRank; - int WorldSize; - - static int ShmRank; - static int ShmSize; - static int GroupSize; - static int GroupRank; - - std::vector LexicographicToWorldRank; +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) + MPI_Comm communicator; + static MPI_Comm communicator_world; + typedef MPI_Request CommsRequest_t; #else - typedef int CommsRequest_t; + typedef int CommsRequest_t; #endif - static void Init(int *argc, char ***argv); + //////////////////////////////////////////////////////////////////// + // Helper functionality for SHM Windows common to all other impls + //////////////////////////////////////////////////////////////////// + // Longer term; drop this in favour of a master / slave model with + // cartesian communicator on a subset of ranks, slave ranks controlled + // by group leader with data xfer via shared memory + //////////////////////////////////////////////////////////////////// +#ifdef GRID_COMMS_MPI3 + std::vector WorldDims; + std::vector GroupDims; + std::vector ShmDims; + + std::vector GroupCoor; + std::vector ShmCoor; + std::vector WorldCoor; + + static std::vector GroupRanks; + static std::vector MyGroup; + static int ShmSetup; + static MPI_Win ShmWindow; + static MPI_Comm ShmComm; + + std::vector LexicographicToWorldRank; + + static std::vector ShmCommBufs; +#else + static void ShmInitGeneric(void); + static commVector ShmBufStorageVector; +#endif + static void * ShmCommBuf; + size_t heap_top; + size_t heap_bytes; + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + void *ShmBufferTranslate(int rank,void * local_p); + void *ShmBufferMalloc(size_t bytes); + void ShmBufferFreeAll(void); + + //////////////////////////////////////////////// + // Must call in Grid startup + //////////////////////////////////////////////// + static void Init(int *argc, char ***argv); + + //////////////////////////////////////////////// + // Constructor of any given grid + //////////////////////////////////////////////// + CartesianCommunicator(const std::vector &pdimensions_in); + + //////////////////////////////////////////////////////////////////////////////////////// + // Wraps MPI_Cart routines, or implements equivalent on other impls + //////////////////////////////////////////////////////////////////////////////////////// + void ShiftedRanks(int dim,int shift,int & source, int & dest); + int RankFromProcessorCoor(std::vector &coor); + void ProcessorCoorFromRank(int rank,std::vector &coor); + + ///////////////////////////////// + // Grid information and queries + ///////////////////////////////// + static int ShmRank; + static int ShmSize; + static int GroupSize; + static int GroupRank; + static int
WorldRank; + static int WorldSize; + static int Slave; + + int IsBoss(void) ; + int BossRank(void) ; + int ThisRank(void) ; + const std::vector & ThisProcessorCoor(void) ; + const std::vector & ProcessorGrid(void) ; + int ProcessorCount(void) ; + static int Ranks (void); + static int Nodes (void); + static int Cores (void); + static int NodeRank (void); + static int CoreRank (void); - // Constructor - CartesianCommunicator(const std::vector &pdimensions_in); + //////////////////////////////////////////////////////////////////////////////// + // very VERY rarely (Log, serial RNG) we need world without a grid + //////////////////////////////////////////////////////////////////////////////// + static int RankWorld(void) ; + static void BroadcastWorld(int root,void* data, int bytes); + + //////////////////////////////////////////////////////////// + // Reduction + //////////////////////////////////////////////////////////// + void GlobalSum(RealF &); + void GlobalSumVector(RealF *,int N); + void GlobalSum(RealD &); + void GlobalSumVector(RealD *,int N); + void GlobalSum(uint32_t &); + void GlobalSum(uint64_t &); + void GlobalSum(ComplexF &c); + void GlobalSumVector(ComplexF *c,int N); + void GlobalSum(ComplexD &c); + void GlobalSumVector(ComplexD *c,int N); + + template void GlobalSum(obj &o){ + typedef typename obj::scalar_type scalar_type; + int words = sizeof(obj)/sizeof(scalar_type); + scalar_type * ptr = (scalar_type *)& o; + GlobalSumVector(ptr,words); + } + + //////////////////////////////////////////////////////////// + // Face exchange, buffer swap in translational invariant way + //////////////////////////////////////////////////////////// + void SendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendRecvPacket(void *xmit, + void *recv, + int xmit_to_rank, + int recv_from_rank, + int bytes); + + void SendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendToRecvFromComplete(std::vector &waitall); - // Wraps MPI_Cart routines - void ShiftedRanks(int dim,int shift,int & source, int & dest); - int RankFromProcessorCoor(std::vector &coor); - void ProcessorCoorFromRank(int rank,std::vector &coor); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void StencilSendToRecvFromComplete(std::vector &waitall); + void StencilBarrier(void); - // Helper function for SHM Windows in MPI3 - void *ShmBufferSelf(void); - void *ShmBuffer(int rank); - - ///////////////////////////////// - // Grid information queries - ///////////////////////////////// - int IsBoss(void) { return _processor==0; }; - int BossRank(void) { return 0; }; - int ThisRank(void) { return _processor; }; - const std::vector & ThisProcessorCoor(void) { return _processor_coor; }; - const std::vector & ProcessorGrid(void) { return _processors; }; - int ProcessorCount(void) { return _Nprocessors; }; - - //////////////////////////////////////////////////////////// - // Reduction - //////////////////////////////////////////////////////////// - void GlobalSum(RealF &); - void GlobalSumVector(RealF *,int N); - - void GlobalSum(RealD &); - void GlobalSumVector(RealD *,int N); - - void GlobalSum(uint32_t &); - void GlobalSum(uint64_t &); - - void GlobalSum(ComplexF &c) - { - GlobalSumVector((float *)&c,2); - } - void GlobalSumVector(ComplexF *c,int N) - { - GlobalSumVector((float *)c,2*N); - } - - void GlobalSum(ComplexD 
&c) - { - GlobalSumVector((double *)&c,2); - } - void GlobalSumVector(ComplexD *c,int N) - { - GlobalSumVector((double *)c,2*N); - } - - template void GlobalSum(obj &o){ - typedef typename obj::scalar_type scalar_type; - int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; - GlobalSumVector(ptr,words); - } - //////////////////////////////////////////////////////////// - // Face exchange, buffer swap in translational invariant way - //////////////////////////////////////////////////////////// - void SendToRecvFrom(void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes); - - void SendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void SendToRecvFromComplete(std::vector &waitall); - void StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void StencilSendToRecvFromComplete(std::vector &waitall) - { - SendToRecvFromComplete(waitall); - } - - //////////////////////////////////////////////////////////// - // Barrier - //////////////////////////////////////////////////////////// - void Barrier(void); - - //////////////////////////////////////////////////////////// - // Broadcast a buffer and composite larger - //////////////////////////////////////////////////////////// - void Broadcast(int root,void* data, int bytes); - template void Broadcast(int root,obj &data) + //////////////////////////////////////////////////////////// + // Barrier + //////////////////////////////////////////////////////////// + void Barrier(void); + + //////////////////////////////////////////////////////////// + // Broadcast a buffer and composite larger + //////////////////////////////////////////////////////////// + void Broadcast(int root,void* data, int bytes); + + template void Broadcast(int root,obj &data) { Broadcast(root,(void *)&data,sizeof(data)); }; - static void BroadcastWorld(int root,void* data, int bytes); - }; } diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 4291b319..a638eebb 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -30,19 +30,28 @@ Author: Peter Boyle namespace Grid { - // Should error check all MPI calls. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +MPI_Comm CartesianCommunicator::communicator_world; + +// Should error check all MPI calls. 
void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } -} - -int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; + ShmInitGeneric(); } CartesianCommunicator::CartesianCommunicator(const std::vector &processors) @@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) _processors = processors; _processor_coor.resize(_ndimension); - MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); @@ -67,15 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; -} - void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -194,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } - + /////////////////////////////////////////////////////// + // Should only be used prior to Grid Init finished. + // Check for this? + /////////////////////////////////////////////////////// void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { int ierr= MPI_Bcast(data, bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index f5bbdbda..00b0ca11 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -33,26 +32,197 @@ Author: Peter Boyle namespace Grid { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmSetup = 0; -// Global used by Init and nowhere else. How to hide? -int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; +MPI_Comm CartesianCommunicator::communicator_world; +MPI_Comm CartesianCommunicator::ShmComm; +MPI_Win CartesianCommunicator::ShmWindow; + +std::vector CartesianCommunicator::GroupRanks; +std::vector CartesianCommunicator::MyGroup; +std::vector CartesianCommunicator::ShmCommBufs; + +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return ShmCommBufs[ShmRank]; } - // Should error check all MPI calls. 
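The translation trick used throughout this communicator: every rank in the SHM group maps the same shared window, generally at a different virtual base address, so a pointer valid on one rank is converted into a peer's address space by carrying the byte offset across. A minimal standalone sketch of that rebasing, with illustrative names rather than the Grid API:

    // my_base and peer_base are the two ranks' mappings of the same shared window.
    void *translate(void *local_p, void *my_base, void *peer_base) {
      uint64_t offset = (uint64_t)local_p - (uint64_t)my_base;
      return (void *)((uint64_t)peer_base + offset);
    }

This is exactly the arithmetic ShmBufferTranslate performs below with ShmCommBufs[ShmRank] and ShmCommBufs[gpeer].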
+void *CartesianCommunicator::ShmBuffer(int rank) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + return ShmCommBufs[gpeer]; + } +} +void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; + uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; + return (void *) remote; + } +} + void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Want to implement some magic ... Group sub-cubes into those on same node - // - //////////////////////////////////////////////////////////////////////////////////////////////////////////// + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + + ///////////////////////////////////////////////////////////////////// + // Split into groups that can share memory + ///////////////////////////////////////////////////////////////////// + MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); + MPI_Comm_rank(ShmComm ,&ShmRank); + MPI_Comm_size(ShmComm ,&ShmSize); + GroupSize = WorldSize/ShmSize; + + ///////////////////////////////////////////////////////////////////// + // find world ranks in our SHM group (i.e. which ranks are on our node) + ///////////////////////////////////////////////////////////////////// + MPI_Group WorldGroup, ShmGroup; + MPI_Comm_group (communicator_world, &WorldGroup); + MPI_Comm_group (ShmComm, &ShmGroup); + + std::vector world_ranks(WorldSize); + GroupRanks.resize(WorldSize); + MyGroup.resize(ShmSize); + for(int r=0;r()); + int myleader = MyGroup[0]; + + std::vector leaders_1hot(WorldSize,0); + std::vector leaders_group(GroupSize,0); + leaders_1hot [ myleader ] = 1; + + /////////////////////////////////////////////////////////////////// + // global sum leaders over comm world + /////////////////////////////////////////////////////////////////// + int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); + assert(ierr==0); + + /////////////////////////////////////////////////////////////////// + // find the group leaders world rank + /////////////////////////////////////////////////////////////////// + int group=0; + for(int l=0;l coor = _processor_coor; @@ -80,139 +250,13 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c rank = LexicographicToWorldRank[rank]; } -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Info that is setup once and indept of cartesian layout -/////////////////////////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::ShmSetup = 0; -int CartesianCommunicator::ShmRank; -int CartesianCommunicator::ShmSize; -int CartesianCommunicator::GroupRank; -int CartesianCommunicator::GroupSize; -MPI_Comm CartesianCommunicator::ShmComm; -MPI_Win CartesianCommunicator::ShmWindow; -std::vector CartesianCommunicator::GroupRanks; -std::vector CartesianCommunicator::MyGroup; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { - - _ndimension = 
processors.size(); - - WorldDims = processors; - - communicator = MPI_COMM_WORLD; - MPI_Comm_rank(communicator,&WorldRank); - MPI_Comm_size(communicator,&WorldSize); - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free. - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Does every grid need one, or could we share across all grids via a singleton/guard? int ierr; - if ( !ShmSetup ) { + communicator=communicator_world; - MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); - MPI_Comm_rank(ShmComm ,&ShmRank); - MPI_Comm_size(ShmComm ,&ShmSize); - GroupSize = WorldSize/ShmSize; - - ///////////////////////////////////////////////////////////////////// - // find world ranks in our SHM group (i.e. which ranks are on our node) - ///////////////////////////////////////////////////////////////////// - MPI_Group WorldGroup, ShmGroup; - MPI_Comm_group (communicator, &WorldGroup); - MPI_Comm_group (ShmComm, &ShmGroup); - - std::vector world_ranks(WorldSize); - GroupRanks.resize(WorldSize); - MyGroup.resize(ShmSize); - for(int r=0;r()); - int myleader = MyGroup[0]; - - std::vector leaders_1hot(WorldSize,0); - std::vector leaders_group(GroupSize,0); - leaders_1hot [ myleader ] = 1; - - /////////////////////////////////////////////////////////////////// - // global sum leaders over comm world - /////////////////////////////////////////////////////////////////// - ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); - assert(ierr==0); - - /////////////////////////////////////////////////////////////////// - // find the group leaders world rank - /////////////////////////////////////////////////////////////////// - int group=0; - for(int l=0;l &processors) //////////////////////////////////////////////////////////////// int dim = 0; + std::vector WorldDims = processors; + ShmDims.resize(_ndimension,1); GroupDims.resize(_ndimension); @@ -346,21 +392,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, } } - -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return ShmCommBufs[ShmRank]; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - int gpeer = GroupRanks[rank]; - if (gpeer == MPI_UNDEFINED){ - return NULL; - } else { - return ShmCommBufs[gpeer]; - } -} - // Basic Halo comms primitive void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, @@ -369,6 +400,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { +#if 1 MPI_Request xrq; MPI_Request rrq; @@ -387,12 +419,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis sequence++; - char *to_ptr = (char *)ShmCommBufs[gdest]; char *from_ptr = (char *)ShmCommBufs[ShmRank]; int small = (bytes &lis if ( small && (gdest !=MPI_UNDEFINED) ) { + char *to_ptr = (char *)ShmCommBufs[gdest]; + assert(gme != gdest); T *ip = (T *)xmit; T *op = (T *)to_ptr; PARALLEL_FOR_LOOP for(int w=0;w "<< gdest<<" " < &list, @@ -476,19 +528,29 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list) +{ + SendToRecvFromComplete(list); +} + +void CartesianCommunicator::StencilBarrier(void) +{ MPI_Win_sync (ShmWindow); MPI_Barrier (ShmComm); MPI_Win_sync (ShmWindow); - } - void 
CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } @@ -514,7 +576,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 80b8fb90..d32fe4fa 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -28,18 +28,29 @@ Author: Peter Boyle #include "Grid.h" namespace Grid { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; +void * CartesianCommunicator::ShmCommBuf; +commVector CartesianCommunicator::ShmBufStorageVector; + void CartesianCommunicator::Init(int *argc, char *** arv) { -} - -int Rank(void ){ return 0; }; -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; + WorldRank = 0; + WorldSize = 1; + ShmRank=0; + ShmSize=1; + GroupRank=_WorldRank; + GroupSize=_WorldSize; + Slave =0; + ShmInitGeneric(); } CartesianCommunicator::CartesianCommunicator(const std::vector &processors) @@ -97,30 +108,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(0); } -void CartesianCommunicator::Barrier(void) -{ -} - -void CartesianCommunicator::Broadcast(int root,void* data, int bytes) -{ -} -void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) -{ -} - - +void CartesianCommunicator::Barrier(void){} +void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) { return 0;} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ assert(0);} void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { source =0; dest=0; } -int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) -{ - return 0; -} -void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor) -{ -} } diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 4af719b0..544b37c7 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -39,25 +39,19 @@ namespace Grid { BACKTRACEFILE(); \ }\ } -int Rank(void) { - return shmem_my_pe(); -} -typedef struct HandShake_t { - uint64_t seq_local; - uint64_t seq_remote; -} HandShake; -static Vector< HandShake > XConnections; -static Vector< HandShake > RConnections; -void *CartesianCommunicator::ShmBufferSelf(void) -{ - return NULL; -} -void *CartesianCommunicator::ShmBuffer(int rank) -{ - return NULL; -} +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout 
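
SendToRecvFromComplete at the top of this section is the drain half of the split-phase exchange: Begin posts an MPI_Irecv/MPI_Isend pair and stashes both requests, Complete issues one MPI_Waitall over the accumulated list (note the series quietly drops the assert on the MPI_Waitall return code here). The shape of the pairing, with placeholder buffer names and the patch's tag-by-sender-rank convention:

#include <mpi.h>
#include <vector>

// Begin: post both sides, remember the requests.
void send_to_recv_from_begin(MPI_Comm comm, int me, int dest, int from,
                             void *xmit, void *recv, int bytes,
                             std::vector<MPI_Request> &list) {
  MPI_Request xrq, rrq;
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &rrq);
  MPI_Isend(xmit, bytes, MPI_CHAR, dest, me, comm, &xrq);
  list.push_back(rrq);
  list.push_back(xrq);
}

// Complete: one MPI_Waitall drains every outstanding request.
void send_to_recv_from_complete(std::vector<MPI_Request> &list) {
  std::vector<MPI_Status> status(list.size());
  MPI_Waitall((int)list.size(), &list[0], &status[0]);
  list.clear();
}
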
+/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); @@ -69,7 +63,36 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_remote= 0; } shmem_barrier_all(); + ShmInitGeneric(); } + + +// Should error check all MPI calls. +void CartesianCommunicator::Init(int *argc, char ***argv) { + int flag; + MPI_Initialized(&flag); // needed to coexist with other libs apparently + if ( !flag ) { + MPI_Init(argc,argv); + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&_WorldRank); + MPI_Comm_size(communicator_world,&_WorldSize); + _ShmRank=0; + _ShmSize=1; + _GroupRank=_WorldRank; + _GroupSize=_WorldSize; + _Slave =0; + } +} + + +typedef struct HandShake_t { + uint64_t seq_local; + uint64_t seq_remote; +} HandShake; + +static Vector< HandShake > XConnections; +static Vector< HandShake > RConnections; + CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); From 392e0645137a988dea02f74e27ba2a8ad8182328 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 24 Oct 2016 19:24:21 +0100 Subject: [PATCH 14/27] fast local peek-poke --- lib/lattice/Lattice_peekpoke.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/lib/lattice/Lattice_peekpoke.h b/lib/lattice/Lattice_peekpoke.h index 9bece943..19d349c4 100644 --- a/lib/lattice/Lattice_peekpoke.h +++ b/lib/lattice/Lattice_peekpoke.h @@ -154,7 +154,7 @@ PARALLEL_FOR_LOOP template void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ - GridBase *grid=l._grid; + GridBase *grid = l._grid; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -164,16 +164,18 @@ PARALLEL_FOR_LOOP assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; + + for(int w=0;wCheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - // extract-modify-merge cycle is easiest way and this is not perf critical - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; - buf[idx] = s; - - merge(l._odata[odx],buf); + for(int w=0;w Date: Mon, 24 Oct 2016 19:25:15 +0100 Subject: [PATCH 15/27] memory optimisation --- lib/cartesian/Cartesian_base.h | 2 +- lib/cartesian/Cartesian_full.h | 2 +- lib/cartesian/Cartesian_red_black.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 8a24c87b..72b21ee3 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -77,7 +77,7 @@ public: // GridCartesian / GridRedBlackCartesian 
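
Patch 14 above replaces the extract/merge round trip in peekLocalSite/pokeLocalSite with direct strided copies: lane idx of SIMD word w lives at scalar offset idx + w*Nsimd, so touching one site moves only 'words' scalars instead of all Nsimd lanes. The indexing in isolation (sketch with plain arrays; names mirror the patch):

// Gather lane 'idx' out of an interleaved SIMD object: the scalar parts
// of one site are strided by Nsimd in memory.
template <typename scalar_type>
void peek_lane(const scalar_type *vp, scalar_type *pt,
               int words, int Nsimd, int idx) {
  for (int w = 0; w < words; w++)
    pt[w] = vp[idx + w * Nsimd];   // read one lane only
}

template <typename scalar_type>
void poke_lane(scalar_type *vp, const scalar_type *pt,
               int words, int Nsimd, int idx) {
  for (int w = 0; w < words; w++)
    vp[idx + w * Nsimd] = pt[w];   // write one lane, others untouched
}
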
//////////////////////////////////////////////////////////////// virtual int CheckerBoarded(int dim)=0; - virtual int CheckerBoard(std::vector site)=0; + virtual int CheckerBoard(std::vector &site)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index 14ab8b55..b0d20441 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -49,7 +49,7 @@ public: virtual int CheckerBoarded(int dim){ return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ return 0; } virtual int CheckerBoardDestination(int cb,int shift,int dim){ diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index db0508d5..6a4300d7 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -49,7 +49,7 @@ public: if( dim==_checker_dim) return 1; else return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ int linear=0; assert(site.size()==_ndimension); for(int d=0;d<_ndimension;d++){ From 13bf0482e3165514feeccf626d60add2aa91910c Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 24 Oct 2016 19:25:40 +0100 Subject: [PATCH 16/27] FFT optimisation --- lib/FFT.h | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index 4cda6483..17060dc3 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -200,18 +200,14 @@ namespace Grid { sign,FFTW_ESTIMATE); } - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - - GridStopWatch timer; + std::vector lcoor(Nd), gcoor(Nd); // Barrel shift and collect global pencil for(int p=0;plSites();idx++) { - std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); sobj s; @@ -228,14 +224,11 @@ namespace Grid { // Loop over orthog coords int NN=pencil_g.lSites(); - - GridStopWatch Timer; - Timer.Start(); + GridStopWatch timer; + timer.Start(); PARALLEL_FOR_LOOP - for(int idx=0;idx lcoor(Nd); + for(int idx=0;idx::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + usec += timer.useconds(); + flops+= flops_call*NN; int pc = processor_coor[dim]; - for(int idx=0;idxlSites();idx++) { - std::vector lcoor(Nd); + for(int idx=0;idxlSites();idx++) { sgrid->LocalIndexToLocalCoor(idx,lcoor); - std::vector gcoor = lcoor; + gcoor = lcoor; // extract the result sobj s; gcoor[dim] = lcoor[dim]+L*pc; From b94478fa51c46039bfc6beda741790b65dd2c46e Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Mon, 24 Oct 2016 23:45:31 +0100 Subject: [PATCH 17/27] mpi, mpi3, shmem all compile. 
mpi, mpi3 pass single node multi-rank --- lib/AlignedAllocator.h | 2 +- lib/Stencil.h | 31 ++++++------- lib/communicator/Communicator_base.cc | 12 ++--- lib/communicator/Communicator_mpi3.cc | 24 +++------- lib/communicator/Communicator_none.cc | 4 +- lib/communicator/Communicator_shmem.cc | 61 +++++++------------------- 6 files changed, 49 insertions(+), 85 deletions(-) diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h index 44929ca8..fa001adc 100644 --- a/lib/AlignedAllocator.h +++ b/lib/AlignedAllocator.h @@ -141,7 +141,7 @@ public: if ( bcast != ptr ) { std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); - BACKTRACEFILE(); + // BACKTRACEFILE(); exit(0); } assert( bcast == (void *) ptr); diff --git a/lib/Stencil.h b/lib/Stencil.h index f22acb6f..221785b4 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -32,8 +32,6 @@ #include // subdir aggregate -const int ShmDirectCopy = 1; - ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -170,13 +168,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); + /* }else{ _grid->SendToRecvFromBegin(reqs[i], Packets[i].send_buf, @@ -185,17 +183,19 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Packets[i].from_rank, Packets[i].bytes); } + */ } commtime+=usecond(); } void CommunicateComplete(std::vector > &reqs) { commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); - else - _grid->SendToRecvFromComplete(reqs[i]); + // else + // _grid->SendToRecvFromComplete(reqs[i]); } commtime+=usecond(); } @@ -253,8 +253,6 @@ PARALLEL_FOR_LOOP // Flat vector, change layout for cache friendly. 
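
From patch 17 on, the compile-time ShmDirectCopy switch is gone: the stencil always asks the communicator for a shared-memory alias of the peer's receive buffer, and falls back to the MPI staging buffer only when the translation comes back NULL (peer on another node). The decision in skeleton form (shm_buffer_translate, staging and halo are placeholder names, not Grid's exact signatures):

#include <cstddef>
#include <cstring>

// Stand-in for Grid's ShmBufferTranslate: nullptr marks an off-node peer.
void *shm_buffer_translate(int rank, void *local_p);

void deliver_halo(int peer, void *peer_recv_base, void *staging,
                  const void *halo, std::size_t bytes) {
  void *dst = shm_buffer_translate(peer, peer_recv_base);
  if (dst == nullptr) dst = staging;  // off-node: stage for Isend/Irecv
  std::memcpy(dst, halo, bytes);      // on-node: the copy is the delivery
}
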
Vector _entries; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - void PrecomputeByteOffsets(void){ for(int i=0;i<_entries.size();i++){ if( _entries[i]._is_local ) { @@ -265,9 +263,7 @@ PARALLEL_FOR_LOOP } }; - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = _entries[ent]._is_local; @@ -685,7 +681,9 @@ PARALLEL_FOR_LOOP _grid->StencilBarrier(); HaloGather(source,compress); this->CommunicateBegin(reqs); + _grid->StencilBarrier(); this->CommunicateComplete(reqs); + _grid->StencilBarrier(); CommsMerge(); // spins } @@ -823,11 +821,13 @@ PARALLEL_FOR_LOOP cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); - if ( (ShmDirectCopy==0)||send_buf==NULL ) { - cobj *send_buf = u_send_buf_p; + if ( (send_buf==NULL) ) { + send_buf = u_send_buf_p; } - + // std::cout << " send_bufs "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if ((ShmDirectCopy==0)||(shm==NULL)) { + // if ((ShmDirectCopy==0)||(shm==NULL)) { + if (shm==NULL) { shm = rp; } diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc index 1272b6a2..91e9cf9b 100644 --- a/lib/communicator/Communicator_base.cc +++ b/lib/communicator/Communicator_base.cc @@ -69,7 +69,7 @@ int CartesianCommunicator::ProcessorCount(void) { return //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::RankWorld(void) { return WorldRank; }; +int CartesianCommunicator::RankWorld(void){ return WorldRank; }; int CartesianCommunicator::Ranks (void) { return WorldSize; }; int CartesianCommunicator::Nodes (void) { return GroupSize; }; int CartesianCommunicator::Cores (void) { return ShmSize; }; @@ -108,22 +108,22 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector CartesianCommunicator::ShmBufStorageVector; void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } + void *CartesianCommunicator::ShmBuffer(int rank) { - if (rank != ShmRank ) return NULL; - else return ShmCommBuf; + return NULL; } void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { - if (rank != ShmRank ) return NULL; - else return local_p; + return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmCommBuf=(void *)&ShmBufStorageVector[0]; + std::cout << "allocated persistent buffer"< &lis int from, int bytes) { -#if 1 +#if 0 + this->StencilBarrier(); + MPI_Request xrq; MPI_Request rrq; @@ -440,9 +442,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis PARALLEL_FOR_LOOP for(int w=0;w "<< gdest<<" " <StencilBarrier(); + if (small && (gfrom !=MPI_UNDEFINED) ) { T *ip = (T *)from_ptr; T *op = (T *)recv; PARALLEL_FOR_LOOP for(int w=0;wStencilBarrier(); + #else MPI_Request xrq; MPI_Request rrq; @@ -528,9 +521,6 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector XConnections; +static Vector< 
HandShake > RConnections; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); @@ -62,37 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_local = 0; RConnections[pe].seq_remote= 0; } + WorldSize = shmem_n_pes(); + WorldRank = shmem_my_pe(); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; shmem_barrier_all(); ShmInitGeneric(); } - -// Should error check all MPI calls. -void CartesianCommunicator::Init(int *argc, char ***argv) { - int flag; - MPI_Initialized(&flag); // needed to coexist with other libs apparently - if ( !flag ) { - MPI_Init(argc,argv); - MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); - MPI_Comm_rank(communicator_world,&_WorldRank); - MPI_Comm_size(communicator_world,&_WorldSize); - _ShmRank=0; - _ShmSize=1; - _GroupRank=_WorldRank; - _GroupSize=_WorldSize; - _Slave =0; - } -} - - -typedef struct HandShake_t { - uint64_t seq_local; - uint64_t seq_remote; -} HandShake; - -static Vector< HandShake > XConnections; -static Vector< HandShake > RConnections; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); @@ -261,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, if ( _processor == sender ) { - printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); // Check he has posted a receive while(SendSeq->seq_remote == SendSeq->seq_local); - printf("Sender receive %d posted\n",sender,receiver); - // Advance our send count seq = ++(SendSeq->seq_local); @@ -275,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, shmem_putmem(recv,xmit,bytes,receiver); shmem_fence(); - printf("Sender sent payload %d\n",seq); //Notify him we're done shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_fence(); - printf("Sender ringing door bell %d\n",seq); } if ( _processor == receiver ) { - printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); // Post a receive seq = ++(RecvSeq->seq_local); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); - printf("Receiver Opening letter box %d\n",seq); - - // Now wait until he has advanced our reception counter while(RecvSeq->seq_remote != RecvSeq->seq_local); - printf("Receiver Got the mail %d\n",seq); } } From 7c3363b91ecbdb06de96842a610f73697a3de873 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 25 Oct 2016 00:04:17 +0100 Subject: [PATCH 18/27] Compiles all comms targets --- lib/communicator/Communicator_none.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index f4e4c575..198c1add 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -31,15 +31,6 @@ namespace Grid { /////////////////////////////////////////////////////////////////////////////////////////////////// // Info that is setup once and indept of cartesian layout /////////////////////////////////////////////////////////////////////////////////////////////////// -int CartesianCommunicator::ShmRank; -int CartesianCommunicator::ShmSize; -int CartesianCommunicator::GroupRank; -int CartesianCommunicator::GroupSize; -int CartesianCommunicator::WorldRank; -int CartesianCommunicator::WorldSize; -int CartesianCommunicator::Slave; -void * CartesianCommunicator::ShmCommBuf; -commVector CartesianCommunicator::ShmBufStorageVector; void CartesianCommunicator::Init(int *argc, char *** arv) { From d97a27f4833c6827dc3d344a5186f0a3e4d2d972 
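
Stripped of the printf chatter that this commit deletes, SendRecvPacket is a doorbell protocol over SHMEM symmetric memory: the receiver posts by bumping the sender's remote counter, the sender spins until posted, puts the payload, then puts the sequence number. The ordering in outline (sketch built from the patch's own HandShake counters; shmem_fence only guarantees the doorbell put cannot overtake the payload put to the same PE):

// Sender side: wait for the post, put payload, ring the doorbell.
while (SendSeq->seq_remote == SendSeq->seq_local) { /* spin */ }
uint64_t seq = ++(SendSeq->seq_local);
shmem_putmem(recv, xmit, bytes, receiver);                          // payload
shmem_fence();                                                      // order
shmem_putmem((void *)&(RecvSeq->seq_remote), &seq, sizeof(seq), receiver);

// Receiver side: post, then wait for the doorbell.
uint64_t post = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote), &post, sizeof(post), sender);
while (RecvSeq->seq_remote != RecvSeq->seq_local) { /* spin */ }
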
Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Tue, 25 Oct 2016 01:05:31 +0100
Subject: [PATCH 19/27] Verbose

---
 lib/communicator/Communicator_base.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 91e9cf9b..bccba6be 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -123,7 +123,6 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 void CartesianCommunicator::ShmInitGeneric(void){
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
-  std::cout << "allocated persistent buffer" << std::endl;
 }

Date: Tue, 25 Oct 2016 01:05:52 +0100
Subject: [PATCH 20/27] More random bits on parallel seeding

---
 lib/lattice/Lattice_rng.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index 3254af30..2df5e2d1 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -296,9 +296,10 @@ namespace Grid {
 	  _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 
 	  int l_idx=generator_idx(o_idx,i_idx);
-
-	  std::vector<int> site_seeds(4);
-	  for(int i=0;i<4;i++){
+
+	  const int num_rand_seed=16;
+	  std::vector<int> site_seeds(num_rand_seed);
+	  for(int i=0;i<num_rand_seed;i++){

Date: Tue, 25 Oct 2016 01:45:53 +0100
Subject: [PATCH 21/27] Travis fail fix attempt

---
 lib/Stencil.h                         | 2 +-
 lib/communicator/Communicator_base.cc | 3 ++-
 lib/communicator/Communicator_base.h  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 221785b4..5c3a5ef9 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -821,7 +821,7 @@ PARALLEL_FOR_LOOP
 	  cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
-	  if ( (send_buf==NULL) ) {
+	  if ( send_buf==NULL ) {
 	    send_buf = u_send_buf_p;
 	  }
 	  //	  std::cout << " send_bufs " << std::endl;

Date: Tue, 25 Oct 2016 06:01:12 +0100
Subject: [PATCH 22/27] MPI 3 compile on non-linux

---
 lib/communicator/Communicator_mpi3.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index 61e5e06b..14a152ab 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -27,7 +27,6 @@ Author: Peter Boyle
 /* END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
-#include <numaif.h>
 
 namespace Grid {
 
@@ -157,6 +156,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   assert(ierr==0);
   // KNL hack -- force to numa-domain 1 in flat
 #if 0
+  //#include <numaif.h>
   for(uint64_t page=0;page

Date: Tue, 25 Oct 2016 12:56:40 +0100
Subject: [PATCH 23/27] temporary thread safety in FFT

---
 lib/FFT.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index 17060dc3..9a59ed01 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -227,7 +227,7 @@ namespace Grid {
       GridStopWatch timer;
       timer.Start();
 
-PARALLEL_FOR_LOOP
+//PARALLEL_FOR_LOOP
       for(int idx=0;idx<NN;idx++) {

Date: Tue, 25 Oct 2016 14:21:48 +0100
Subject: [PATCH 24/27] Note: FFT should be GridFFT (not changed yet).
Gauge fix with FFt is added (tests/core) --- lib/qcd/utils/SUn.h | 31 ++++ lib/qcd/utils/WilsonLoops.h | 2 +- tests/core/Test_fft.cc | 3 + tests/core/Test_fft_gfix.cc | 301 ++++++++++++++++++++++++++++++++++++ 4 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 tests/core/Test_fft_gfix.cc diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 3240098a..914c3bad 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -674,6 +674,37 @@ class SU { out += la; } } +/* + add GaugeTrans +*/ + +template + static void GaugeTransform( GaugeField &Umu, GaugeMat &g){ + GridBase *grid = Umu._grid; + conformable(grid,g._grid); + + GaugeMat U(grid); + GaugeMat ag(grid); ag = adj(g); + + for(int mu=0;mu(Umu,mu); + U = g*U*Cshift(ag, mu, 1); + PokeIndex(Umu,U,mu); + } + } + template + static void GaugeTransform( std::vector &U, GaugeMat &g){ + GridBase *grid = g._grid; + GaugeMat ag(grid); ag = adj(g); + for(int mu=0;mu + static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){ + LieRandomize(pRNG,g,1.0); + GaugeTransform(Umu,g); + } // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) // inverse operation: FundamentalLieAlgebraMatrix diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 10022f50..03d45c07 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -522,4 +522,4 @@ typedef WilsonLoops SU3WilsonLoops; } } -#endif \ No newline at end of file +#endif diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 562a53a1..512d6e62 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -441,6 +441,8 @@ int main (int argc, char ** argv) } { + /* + * typedef GaugeImplTypes QEDGimplTypesD; typedef Photon QEDGaction; @@ -450,6 +452,7 @@ int main (int argc, char ** argv) Maxwell.FreePropagator (Source,Prop); std::cout << " MaxwellFree propagator\n"; + */ } Grid_finalize(); } diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc new file mode 100644 index 00000000..1eda9a67 --- /dev/null +++ b/tests/core/Test_fft_gfix.cc @@ -0,0 +1,301 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
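
One step back to the SUn.h hunk above: GaugeTransform implements the standard lattice gauge rotation, with Cshift(ag, mu, 1) supplying the adjoint matrix from the forward neighbour. In conventional notation (my gloss; the relation is textbook, not quoted from the patch):

  U_\mu(x)  -->  U^{g}_\mu(x) = g(x) \, U_\mu(x) \, g^{\dagger}(x + \hat\mu)

RandomGaugeTransform draws g via LieRandomize and applies exactly this map, so every gauge-invariant quantity, the plaquette in particular, must come through it unchanged; that invariance is how the gauge-fixing test below sanity-checks itself.
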
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; +using namespace Grid::QCD; + +template +class FourierAcceleratedGaugeFixer : public Gimpl { + public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { + for(int mu=0;mu &A,GaugeMat &dmuAmu) { + dmuAmu=zero; + for(int mu=0;mu::avgPlaquette(Umu); + RealD org_link_trace=WilsonLoops::linkTrace(Umu); + RealD old_trace = org_link_trace; + RealD trG; + + std::vector U(Nd,grid); + GaugeMat dmuAmu(grid); + + for(int i=0;i(Umu,mu); + //trG = SteepestDescentStep(U,alpha,dmuAmu); + trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu); + for(int mu=0;mu(Umu,U[mu],mu); + // Monitor progress and convergence test + // infrequently to minimise cost overhead + if ( i %20 == 0 ) { + RealD plaq =WilsonLoops::avgPlaquette(Umu); + RealD link_trace=WilsonLoops::linkTrace(Umu); + + std::cout << GridLogMessage << " Iteration "< &U,RealD & alpha, GaugeMat & dmuAmu) { + GridBase *grid = U[0]._grid; + + std::vector A(Nd,grid); + GaugeMat g(grid); + + GaugeLinkToLieAlgebraField(U,A); + ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); + + + RealD vol = grid->gSites(); + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static RealD FourierAccelSteepestDescentStep(std::vector &U,RealD & alpha, GaugeMat & dmuAmu) { + + GridBase *grid = U[0]._grid; + + RealD vol = grid->gSites(); + + FFT theFFT((GridCartesian *)grid); + + LatticeComplex Fp(grid); + LatticeComplex psq(grid); psq=zero; + LatticeComplex pmu(grid); + LatticeComplex one(grid); one = ComplexD(1.0,0.0); + + GaugeMat g(grid); + GaugeMat dmuAmu_p(grid); + std::vector A(Nd,grid); + + GaugeLinkToLieAlgebraField(U,A); + + DmuAmu(A,dmuAmu); + + theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward); + + ////////////////////////////////// + // Work out Fp = psq_max/ psq... + ////////////////////////////////// + std::vector latt_size = grid->GlobalDimensions(); + std::vector coor(grid->_ndimension,0); + for(int mu=0;mu::taExp(ciadmam,g); + + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { + GridBase *grid = g._grid; + ComplexD cialpha(0.0,-alpha); + GaugeMat ciadmam(grid); + DmuAmu(A,dmuAmu); + ciadmam = dmuAmu*cialpha; + SU::taExp(ciadmam,g); + } +/* + //////////////////////////////////////////////////////////////// + // NB The FT for fields living on links has an extra phase in it + // Could add these to the FFT class as a later task since this code + // might be reused elsewhere ???? 
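
The two descent steps above differ only in how the gradient \partial_\mu A_\mu becomes an update; this is the standard Fourier-accelerated steepest descent for Landau gauge. Schematically (my summary of the code, standard conventions):

  plain:        g(x) = exp( -i \alpha \, \partial_\mu A_\mu(x) )
  accelerated:  g(x) = exp( -i \alpha \, F^{-1}[ (\hat p^2_{max} / \hat p^2) \, F[\partial_\mu A_\mu] ](x) )

with lattice momentum \hat p^2 = \sum_\mu 4 \sin^2(\pi n_\mu / L_\mu), exactly the psq assembled from 4.0*sin(pmu*0.5)^2, and the zero mode handled separately so Fp stays finite. Weighting each mode by \hat p^2_{max}/\hat p^2 enlarges the step for the slow long-wavelength modes, which is what removes the critical slowing down of plain descent; convergence shows up as trG -> 1 while the plaquette, being gauge invariant, must hold constant to rounding.
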
+ //////////////////////////////////////////////////////////////// + static void InverseFourierTransformAmu(FFT &theFFT,const std::vector &Ap,std::vector &Ax) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + GaugeMat Apha(grid); + + ComplexD ci(0.0,1.0); + + for(int mu=0;mu &Ax,std::vector &Ap) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + ComplexD ci(0.0,1.0); + + // Sign convention for FFTW calls: + // A(x)= Sum_p e^ipx A(p) / V + // A(p)= Sum_p e^-ipx A(x) + + for(int mu=0;mu seeds({1,2,3,4}); + + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + int vol = 1; + for(int d=0;d::avgPlaquette(Umu); + std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "< Date: Wed, 26 Oct 2016 17:36:26 +0100 Subject: [PATCH 25/27] more FFT optimisations --- lib/FFT.h | 233 +++++++++++++++++++---------------------- tests/core/Test_fft.cc | 24 +++-- 2 files changed, 121 insertions(+), 136 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index 9a59ed01..71cf8a3e 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -98,174 +98,157 @@ namespace Grid { #define FFTW_BACKWARD (+1) #endif - class FFT { + class FFT { private: - + GridCartesian *vgrid; GridCartesian *sgrid; - + int Nd; double flops; double flops_call; uint64_t usec; - + std::vector dimensions; std::vector processors; std::vector processor_coor; - + public: - + static const int forward=FFTW_FORWARD; static const int backward=FFTW_BACKWARD; - + double Flops(void) {return flops;} double MFlops(void) {return flops/usec;} - - FFT ( GridCartesian * grid ) : - vgrid(grid), - Nd(grid->_ndimension), - dimensions(grid->_fdimensions), - processors(grid->_processors), - processor_coor(grid->_processor_coor) + + FFT ( GridCartesian * grid ) : + vgrid(grid), + Nd(grid->_ndimension), + dimensions(grid->_fdimensions), + processors(grid->_processors), + processor_coor(grid->_processor_coor) { flops=0; usec =0; std::vector layout(Nd,1); sgrid = new GridCartesian(dimensions,layout,processors); }; - - ~FFT ( void) { - delete sgrid; + + ~FFT ( void) { + delete sgrid; } template void FFT_dim(Lattice &result,const Lattice &source,int dim, int inverse){ - +#ifndef HAVE_FFTW + assert(0); +#else conformable(result._grid,vgrid); conformable(source._grid,vgrid); - + int L = vgrid->_ldimensions[dim]; int G = vgrid->_fdimensions[dim]; - + std::vector layout(Nd,1); std::vector pencil_gd(vgrid->_fdimensions); - - pencil_gd[dim] = G*processors[dim]; - + + pencil_gd[dim] = G*processors[dim]; + // Pencil global vol LxLxGxLxL per node GridCartesian pencil_g(pencil_gd,layout,processors); - + // Construct pencils typedef typename vobj::scalar_object sobj; typedef typename sobj::scalar_type scalar; + + Lattice pgbuf(&pencil_g); + - Lattice ssource(vgrid); ssource =source; - Lattice pgsource(&pencil_g); - Lattice pgresult(&pencil_g); pgresult=zero; - -#ifndef HAVE_FFTW - assert(0); -#else typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; - - { - int Ncomp = sizeof(sobj)/sizeof(scalar); - int Nlow = 1; - for(int d=0;d_ldimensions[d]; - } - - int rank = 1; /* 1d transforms */ - int n[] 
= {G}; /* 1d transforms of length G */ - int howmany = Ncomp; - int odist,idist,istride,ostride; - idist = odist = 1; /* Distance between consecutive FT's */ - istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ - int *inembed = n, *onembed = n; - - - int sign = FFTW_FORWARD; - if (inverse) sign = FFTW_BACKWARD; - - FFTW_plan p; - { - FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0]; - FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0]; - p = FFTW::fftw_plan_many_dft(rank,n,howmany, - in,inembed, - istride,idist, - out,onembed, - ostride, odist, - sign,FFTW_ESTIMATE); - } - - std::vector lcoor(Nd), gcoor(Nd); - - // Barrel shift and collect global pencil - for(int p=0;plSites();idx++) { - - - sgrid->LocalIndexToLocalCoor(idx,lcoor); - - sobj s; - - peekLocalSite(s,ssource,lcoor); - - lcoor[dim]+=p*L; - - pokeLocalSite(s,pgsource,lcoor); - } - - ssource = Cshift(ssource,dim,L); - } - - // Loop over orthog coords - int NN=pencil_g.lSites(); - GridStopWatch timer; - timer.Start(); - -//PARALLEL_FOR_LOOP - for(int idx=0;idx::fftw_execute_dft(p,in,out); - } - } - - timer.Stop(); - - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - usec += timer.useconds(); - flops+= flops_call*NN; - int pc = processor_coor[dim]; - for(int idx=0;idxlSites();idx++) { - sgrid->LocalIndexToLocalCoor(idx,lcoor); - gcoor = lcoor; - // extract the result - sobj s; - gcoor[dim] = lcoor[dim]+L*pc; - peekLocalSite(s,pgresult,gcoor); - pokeLocalSite(s,result,lcoor); - } - - FFTW::fftw_destroy_plan(p); + + int Ncomp = sizeof(sobj)/sizeof(scalar); + int Nlow = 1; + for(int d=0;d_ldimensions[d]; } + + int rank = 1; /* 1d transforms */ + int n[] = {G}; /* 1d transforms of length G */ + int howmany = Ncomp; + int odist,idist,istride,ostride; + idist = odist = 1; /* Distance between consecutive FT's */ + istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ + int *inembed = n, *onembed = n; + + int sign = FFTW_FORWARD; + if (inverse) sign = FFTW_BACKWARD; + + FFTW_plan p; + { + FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0]; + FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0]; + p = FFTW::fftw_plan_many_dft(rank,n,howmany, + in,inembed, + istride,idist, + out,onembed, + ostride, odist, + sign,FFTW_ESTIMATE); + } + + // Barrel shift and collect global pencil + std::vector lcoor(Nd), gcoor(Nd); + result = source; + for(int p=0;plSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + sobj s; + peekLocalSite(s,result,lcoor); + lcoor[dim]+=p*L; + pokeLocalSite(s,pgbuf,lcoor); + } + result = Cshift(result,dim,L); + } + + // Loop over orthog coords + int NN=pencil_g.lSites(); + GridStopWatch timer; + timer.Start(); + //PARALLEL_FOR_LOOP + for(int idx=0;idx::fftw_execute_dft(p,in,out); + } + } + timer.Stop(); + + // performance counting + double add,mul,fma; + FFTW::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + usec += timer.useconds(); + flops+= flops_call*NN; + + // writing out result + int pc = processor_coor[dim]; + for(int idx=0;idxlSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + gcoor = lcoor; + sobj s; + gcoor[dim] = lcoor[dim]+L*pc; + peekLocalSite(s,pgbuf,gcoor); + pokeLocalSite(s,result,lcoor); + } + + // destroying plan + FFTW::fftw_destroy_plan(p); #endif - - } - }; - - } #endif diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 2fdaeed2..d5b60752 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -76,13 +76,14 @@ int main (int argc, char ** 
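
Before the test diff resumes: the FFT_dim rewrite above leans on FFTW's advanced interface, where a single plan runs 'howmany' interleaved transforms and the choice istride = Ncomp*Nlow, idist = 1 matches Grid's site-interleaved scalar layout. A self-contained toy with the same stride pattern (double-precision FFTW3 called directly, small made-up sizes; Grid's FFTW wrapper templates this over precision):

#include <fftw3.h>

int main(void) {
  const int G = 16;        // transform length (one pencil)
  const int howmany = 4;   // interleaved component transforms
  fftw_complex *buf = fftw_alloc_complex((size_t)(G * howmany));

  int n[1] = {G};
  // Transform j walks buf[j], buf[j+howmany], ... : stride 'howmany'
  // between elements of one FT, distance 1 between neighbouring FTs.
  fftw_plan p = fftw_plan_many_dft(1, n, howmany,
                                   buf, n, howmany, 1,
                                   buf, n, howmany, 1,
                                   FFTW_FORWARD, FFTW_ESTIMATE);

  for (int i = 0; i < G * howmany; i++) { buf[i][0] = 1.0; buf[i][1] = 0.0; }
  fftw_execute(p);

  fftw_destroy_plan(p);
  fftw_free(buf);
  return 0;
}
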
argv)
   S=zero;
   S = S+C;
 
+  Ctilde = C;
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops() << std::endl;

Date: Wed, 26 Oct 2016 18:50:07 +0100
Subject: [PATCH 26/27] debug message removed

---
 lib/communicator/Communicator_base.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc
index 9810987d..8b789f4b 100644
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -48,7 +48,6 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
   void *ptr = (void *)heap_top;
   heap_top  += bytes;
   heap_bytes+= bytes;
-  std::cout << "Shm alloc " << std::endl;

Date: Sat, 29 Oct 2016 11:04:02 +0100
Subject: [PATCH 27/27] Add missing volume factor in stochastic QED field

---
 lib/qcd/action/gauge/Photon.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index 0dd5a9dc..409569d8 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -151,12 +151,19 @@ namespace QCD{
     {
       auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
       const unsigned int nd = grid->_ndimension;
+      std::vector<int>   latt_size = grid->_fdimensions;
 
       GaugeLinkField sqrtK2Inv(grid), r(grid);
       GaugeField     aTilde(grid);
       FFT            fft(grid);
 
+      Integer vol = 1;
+      for(int d = 0; d < nd; d++)
+      {
+        vol = vol * latt_size[d];
+      }
+
       invKHatSquared(sqrtK2Inv);
-      sqrtK2Inv = sqrt(real(sqrtK2Inv));
+      sqrtK2Inv = sqrt(vol*real(sqrtK2Inv));
 
       zmSub(sqrtK2Inv);
 
       for(int mu = 0; mu < nd; mu++)
       {
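
A closing gloss on patch 27, derived from the FFT convention quoted in the commented transform code of Test_fft_gfix.cc, A(x) = (1/V) \sum_p e^{ipx} \tilde A(p), rather than stated in the commit itself: to realise the photon two-point function

  < A_\mu(x) A_\nu(y) > = \delta_{\mu\nu} \, (1/V) \sum_p e^{ip(x-y)} / \hat p^2 ,

unit-variance Gaussian modes \eta(p) must be scaled as \tilde A(p) = \sqrt{V / \hat p^2} \, \eta(p): the unnormalised inverse transform contributes 1/V twice to the variance while the momentum sum contributes only once, leaving a factor V to restore. That \sqrt{V} is exactly the vol moved inside the sqrt by this final patch.
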