From c190221fd35eba678ec9810aa3b44816085d8283 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Sat, 22 Oct 2016 18:14:27 +0100 Subject: [PATCH] Internal SHM comms in non-simd directions working Need to fix simd directions --- benchmarks/Benchmark_dwf.cc | 8 +- lib/Stencil.h | 1609 +++++++++--------- lib/communicator/Communicator_base.h | 15 +- lib/communicator/Communicator_mpi.cc | 8 + lib/communicator/Communicator_mpi3.cc | 105 +- lib/communicator/Communicator_none.cc | 8 + lib/communicator/Communicator_shmem.cc | 8 + lib/qcd/action/fermion/FermionOperatorImpl.h | 857 +++++----- lib/qcd/action/fermion/WilsonFermion.cc | 8 +- lib/qcd/action/fermion/WilsonFermion5D.cc | 101 +- lib/qcd/action/fermion/WilsonFermion5D.h | 289 ++-- lib/qcd/action/fermion/WilsonKernels.cc | 21 +- lib/qcd/action/fermion/WilsonKernels.h | 275 ++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 95 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 55 +- tests/Test_stencil.cc | 6 +- 16 files changed, 1729 insertions(+), 1739 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index a32d7535..f75f0385 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -153,7 +153,7 @@ int main (int argc, char ** argv) std::cout< // subdir aggregate +const int ShmDirectCopy = 1; + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient // gather to a point stencil code. CSHIFT is not the best way, so need @@ -68,7 +70,7 @@ // ////////////////////////////////////////////////////////////////////////////////////////// - namespace Grid { +namespace Grid { inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) @@ -117,822 +119,837 @@ PARALLEL_FOR_LOOP } } -template void -Gather_plane_simple_stencil (const Lattice &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off, - double &t_table ,double & t_data ) -{ - std::vector > table; - Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table); - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - Gather_plane_simple_table (table,rhs,buffer,compress,off,so); -} +struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline +}; +template +class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. + public: + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; - struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - }; + std::vector Packets; - template - class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. - public: + int face_table_computed; + std::vector > > face_table ; + + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + comms_bytes+=2.0*bytes; + Packets.push_back(p); + } - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + }else{ + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + else + _grid->SendToRecvFromComplete(reqs[i]); + } + commtime+=usecond(); + } - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + /////////////////////////////////////////// + // Simd merge queue for asynch comms + /////////////////////////////////////////// + struct Merge { + cobj * mpointer; + std::vector rpointers; + Integer buffer_size; + Integer packet_id; + }; + + std::vector Mergers; - struct Packet { - void * send_buf; - void * recv_buf; - Integer to_rank; - Integer from_rank; - Integer bytes; - }; + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { + Merge m; + m.mpointer = merge_p; + m.rpointers= rpointers; + m.buffer_size = buffer_size; + m.packet_id = packet_id; + Mergers.push_back(m); + } - std::vector Packets; + void CommsMerge(void ) { - int face_table_computed; - std::vector > > face_table ; - - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - comms_bytes+=2.0*bytes; - Packets.push_back(p); - } - - void CommunicateBegin(std::vector > &reqs) - { - reqs.resize(Packets.size()); - commtime-=usecond(); - for(int i=0;iSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - } - commtime+=usecond(); - } - void CommunicateComplete(std::vector > &reqs) - { - commtime-=usecond(); - for(int i=0;iSendToRecvFromComplete(reqs[i]); - } - commtime+=usecond(); - } - - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - Integer buffer_size; - Integer packet_id; - }; - - std::vector Mergers; - - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - Mergers.push_back(m); - } - - void CommsMerge(void ) { - - for(int i=0;i _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + // npoints x Osites() of these + // Flat vector, change layout for cache friendly. + Vector _entries; + + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + + void PrecomputeByteOffsets(void){ + for(int i=0;i<_entries.size();i++){ + if( _entries[i]._is_local ) { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); + } else { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); + } + } + }; - int _checkerboard; - int _npoints; // Move to template param? - GridBase * _grid; + inline uint64_t Touch(int ent) { + // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); + } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + std::vector > u_simd_send_buf; + std::vector > u_simd_recv_buf; + commVector u_send_buf; + commVector u_recv_buf_hide; + cobj* u_recv_buf_p; - // npoints of these - std::vector _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; + int u_comm_offset; + int _unified_buffer_size; + + cobj *CommBuf(void) { return u_recv_buf_p; } - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. - Vector _entries; - - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952 - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; - - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - /////////////////////////////////////////////////////////// - // Comms buffers - /////////////////////////////////////////////////////////// - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; - commVector u_send_buf; - commVector comm_buf; - - int u_comm_offset; - int _unified_buffer_size; - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// #define TIMING_HACK #ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { + double jointime; + double gathertime; + double commtime; + double halogtime; + double mergetime; + double spintime; + double comms_bytes; + double gathermtime; + double splicetime; + double nosplicetime; + double t_data; + double t_table; + double calls; + + void ZeroCounters(void) { + gathertime = 0.; + jointime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + spintime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + t_data = 0.0; + t_table= 0.0; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { #define PRINTIT(A) \ std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls< 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "< 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "< &directions, - const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) - { - face_table_computed=0; - _npoints = npoints; - _grid = grid; - _directions = directions; - _distances = distances; - _unified_buffer_size=0; - - int osites = _grid->oSites(); - - _entries.resize(_npoints* osites); - for(int ii=0;ii_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - _permute_type[point]=_grid->PermuteType(dimension); - - _checkerboard = checkerboard; - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - int rotate_dim = _grid->_simd_layout[dimension]>2; - - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - - int sshift[2]; - - // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they - // live in lattice or a comms buffer. - if ( !comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Local(point,dimension,shift,0x3); - } else { - Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Local(point,dimension,shift,0x2);// both with block stride loop iteration - } - } else { // All permute extract done in comms phase prior to Stencil application - // So tables are the same whether comm_dim or splice_dim - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Comms(point,dimension,shift,0x3); - } else { - Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Comms(point,dimension,shift,0x2);// both with block stride loop iteration - } - } - } - u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); - - PrecomputeByteOffsets(); - - const int Nsimd = grid->Nsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;l_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int gd = _grid->_gdimensions[dimension]; - int ly = _grid->_simd_layout[dimension]; - - // Map to always positive shift modulo global full dimension. - int shift = (shiftpm+fd)%fd; - - // the permute type - int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - - int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - int sx = (x+sshift)%rd; - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sxNsimd(); - - int fd = _grid->_fdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - int shift = (shiftpm + fd) %fd; - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - - _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and - // send to one or more remote nodes. - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - - int sx = (x+sshift)%rd; - - int offnode = 0; - if ( simd_layout > 1 ) { - - for(int i=0;i>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { - offnode =1; - } - } - - } else { - int comm_proc = ((x+sshift)/rd)%pd; - offnode = (comm_proc!= 0); - } - - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { - wraparound = 1; - } - if (!offnode) { - - int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - - } else { - - int words = buffer_size; - if (cbmask != 0x3) words=words>>1; - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; - _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - - } - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._permute=permute; - _entries[idx]._is_local=1; - _entries[idx]._around_the_world=wrap; - } - o +=_grid->_slice_stride[dimension]; - } - - } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - int idx = point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._is_local=1; - _entries[idx]._permute=permute; - _entries[idx]._around_the_world=wrap; - } - - } - o +=_grid->_slice_stride[dimension]; - } - - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute=0; - _entries[idx]._around_the_world=wrap; - } - o +=_grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - int idx = point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute =0; - _entries[idx]._around_the_world=wrap; - } - } - o +=_grid->_slice_stride[dimension]; - } - } - } - - - - template - void HaloExchange(const Lattice &source,compressor &compress) - { - std::vector > reqs; - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - CommsMerge(); // spins - } - - template - void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - - template - void HaloGather(const Lattice &source,compressor &compress) - { - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - assert (comm_buf.size() == _unified_buffer_size ); - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) - { - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - - GridBase *grid=_grid; - assert(rhs._grid==_grid); - // conformable(_grid,rhs._grid); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(cobj); - - gathertime-=usecond(); - int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - t_table-=usecond(); - face_table.resize(face_idx+1); - cobj *ptr; ptr = &u_send_buf[0]; - Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, - face_table[face_idx]); - t_table+=usecond(); - } - t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++; - t_data+=usecond(); - gathertime+=usecond(); - - int rank = _grid->_processor; - int recv_from_rank; - int xmit_to_rank; - _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - assert (xmit_to_rank != _grid->ThisRank()); - assert (recv_from_rank != _grid->ThisRank()); - - // FIXME Implement asynchronous send & also avoid buffer copy - AddPacket((void *)&u_send_buf[u_comm_offset], - (void *) &comm_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); - - u_comm_offset+=words; - } - } - } - - - template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) - { - const int Nsimd = _grid->Nsimd(); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - // This will not work with a rotate dim - assert(simd_layout==2); - assert(shift>=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; - int words = sizeof(cobj)/sizeof(vector_type); - - assert(cbmask==0x3); // Fixme think there is a latent bug if not true - - int bytes = buffer_size*sizeof(scalar_object); - - std::vector rpointers(Nsimd); - std::vector spointers(Nsimd); - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - - int cb = (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - void *vrp = (void *)rp; - void *vsp = (void *)sp; - - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = rp; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - }; - } + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances) + : _permute_type(npoints), _comm_buf_size(npoints) + { + face_table_computed=0; + _npoints = npoints; + _grid = grid; + _directions = directions; + _distances = distances; + _unified_buffer_size=0; + + int osites = _grid->oSites(); + + _entries.resize(_npoints* osites); + for(int ii=0;ii_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + _permute_type[point]=_grid->PermuteType(dimension); + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported + + int sshift[2]; + + // Underlying approach. For each local site build + // up a table containing the npoint "neighbours" and whether they + // live in lattice or a comms buffer. + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Local(point,dimension,shift,0x3); + } else { + Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Local(point,dimension,shift,0x2);// both with block stride loop iteration + } + } else { // All permute extract done in comms phase prior to Stencil application + // So tables are the same whether comm_dim or splice_dim + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Comms(point,dimension,shift,0x3); + } else { + Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Comms(point,dimension,shift,0x2);// both with block stride loop iteration + } + } + } + u_send_buf.resize(_unified_buffer_size); + + ///////////////////////////////////////////////////////////////////////////////// + // Try to allocate for receiving in a shared memory region, fall back to buffer + ///////////////////////////////////////////////////////////////////////////////// + if( ShmDirectCopy ) { + + u_recv_buf_p=(cobj *)_grid->ShmBufferSelf(); + if ( u_recv_buf_p == NULL ) { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + } else { + u_recv_buf_hide.resize(_unified_buffer_size); + u_recv_buf_p=&u_recv_buf_hide[0]; + } + + PrecomputeByteOffsets(); + + const int Nsimd = grid->Nsimd(); + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + for(int l=0;l_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + int ly = _grid->_simd_layout[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (shiftpm+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + + int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sxNsimd(); + + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + int shift = (shiftpm + fd) %fd; + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored + + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + + for(int x=0;xPermuteType(dimension); + + int sx = (x+sshift)%rd; + + int offnode = 0; + if ( simd_layout > 1 ) { + + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + if ( nbr_proc ) { + offnode =1; + } + } + + } else { + int comm_proc = ((x+sshift)/rd)%pd; + offnode = (comm_proc!= 0); + } + + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { + wraparound = 1; + } + if (!offnode) { + + int permute_slice=0; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + int words = buffer_size; + if (cbmask != 0x3) words=words>>1; + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + int unified_buffer_offset = _unified_buffer_size; + _unified_buffer_size += words; + + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase + + } + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int o = 0; // relative offset to base within plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._permute=permute; + _entries[idx]._is_local=1; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { + int idx = point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._is_local=1; + _entries[idx]._permute=permute; + _entries[idx]._around_the_world=wrap; + } + + } + o +=_grid->_slice_stride[dimension]; + } + + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute=0; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + int idx = point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute =0; + _entries[idx]._around_the_world=wrap; + } + } + o +=_grid->_slice_stride[dimension]; + } + } + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + calls++; + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); + CommsMerge(); // spins + } + + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes + GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x1,compress,face_idx); + Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + template + void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + { + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + + GridBase *grid=_grid; + assert(rhs._grid==_grid); + // conformable(_grid,rhs._grid); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(cobj); + + gathertime-=usecond(); + int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane + if ( !face_table_computed ) { + t_table-=usecond(); + face_table.resize(face_idx+1); + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); + t_table+=usecond(); + } + + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + assert (xmit_to_rank != _grid->ThisRank()); + assert (recv_from_rank != _grid->ThisRank()); + + ///////////////////////////////////////////////////////// + // try the direct copy if possible + ///////////////////////////////////////////////////////// + + cobj *u_send_buf_p = &u_send_buf[0]; + if (ShmDirectCopy) { + cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank); + if ( shm!=NULL) { + u_send_buf_p = shm; + } + } + + t_data-=usecond(); + Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++; + t_data+=usecond(); + + AddPacket((void *)&u_send_buf_p[u_comm_offset], + (void *)&u_recv_buf_p[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + + gathertime+=usecond(); + u_comm_offset+=words; + } + } + } + + template + void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + { + const int Nsimd = _grid->Nsimd(); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + // This will not work with a rotate dim + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(cobj)/sizeof(vector_type); + + assert(cbmask==0x3); // Fixme think there is a latent bug if not true + + int bytes = buffer_size*sizeof(scalar_object); + + std::vector rpointers(Nsimd); + std::vector spointers(Nsimd); + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x= rd ); + + if ( any_offnode ) { + + for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); + gathermtime+=usecond(); + + for(int i=0;i2 + // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + int nbr_lcoor= (nbr_coor%ld); + int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer + int nbr_ox = (nbr_lcoor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; + auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; + + if(nbr_proc){ + + int recv_from_rank; + int xmit_to_rank; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = rp; + + } else { + + rpointers[i] = sp; + + } + } + + assert(0); + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); + + u_comm_offset +=buffer_size; + } + } + } + +}; +} #endif diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index f2a6bf00..4139f72a 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -80,7 +80,6 @@ class CartesianCommunicator { void * ShmCommBuf; std::vector ShmCommBufs; - std::vector ShmStencilBufs; int WorldRank; int WorldSize; @@ -105,6 +104,10 @@ class CartesianCommunicator { int RankFromProcessorCoor(std::vector &coor); void ProcessorCoorFromRank(int rank,std::vector &coor); + // Helper function for SHM Windows in MPI3 + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + ///////////////////////////////// // Grid information queries ///////////////////////////////// @@ -173,6 +176,16 @@ class CartesianCommunicator { int recv_from_rank, int bytes); void SendToRecvFromComplete(std::vector &waitall); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + void StencilSendToRecvFromComplete(std::vector &waitall) + { + SendToRecvFromComplete(waitall); + } //////////////////////////////////////////////////////////// // Barrier diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 9c66202f..4291b319 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index be4e66a9..f5bbdbda 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Verbose for now ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - std::cout<< "Ranks per node "<< ShmSize << std::endl; - std::cout<< "Nodes "<< GroupSize << std::endl; - std::cout<< "Ranks "<< WorldSize << std::endl; - std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl; + std::cout< &processors) } ShmCommBufs.resize(ShmSize); - ShmStencilBufs.resize(ShmSize); for(int r=0;r &processors) ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); + for(int l2=0;l2 &list, void *xmit, @@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { -#undef SHM_USE_BCOPY MPI_Request xrq; MPI_Request rrq; static int sequence; - int rank = _processor; int ierr; int tag; int check; @@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis assert(from != _processor); int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; int gme = GroupRanks[_processor]; sequence++; @@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int small = (bytes &lis list.push_back(xrq); } - // std::cout << "Syncing "< &lis assert(ierr==0); list.push_back(rrq); } - - // std::cout << "Syncing"< &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ MPI_Request xrq; MPI_Request rrq; - int rank = _processor; - int ierr; - ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); -#endif + int ierr; + + assert(dest != _processor); + assert(from != _processor); + + int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; + int gme = GroupRanks[_processor]; + + assert(gme == ShmRank); + + if ( gdest == MPI_UNDEFINED ) { + ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + assert(ierr==0); + list.push_back(xrq); + } + + if ( gfrom ==MPI_UNDEFINED) { + ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + assert(ierr==0); + list.push_back(rrq); + } + + MPI_Win_sync (ShmWindow); + MPI_Barrier (ShmComm); + MPI_Win_sync (ShmWindow); + } + void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 8601255a..80b8fb90 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) } int Rank(void ){ return 0; }; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 091e266e..4af719b0 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -50,6 +50,14 @@ typedef struct HandShake_t { static Vector< HandShake > XConnections; static Vector< HandShake > RConnections; +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return NULL; +} +void *CartesianCommunicator::ShmBuffer(int rank) +{ + return NULL; +} void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 4248ec4c..0800dea6 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -33,511 +33,500 @@ directory #define GRID_QCD_FERMION_OPERATOR_IMPL_H namespace Grid { - - namespace QCD { +namespace QCD { - ////////////////////////////////////////////// - // Template parameter class constructs to package - // externally control Fermion implementations - // in orthogonal directions - // - // Ultimately need Impl to always define types where XXX is opaque - // - // typedef typename XXX Simd; - // typedef typename XXX GaugeLinkField; - // typedef typename XXX GaugeField; - // typedef typename XXX GaugeActField; - // typedef typename XXX FermionField; - // typedef typename XXX DoubledGaugeField; - // typedef typename XXX SiteSpinor; - // typedef typename XXX SiteHalfSpinor; - // typedef typename XXX Compressor; - // - // and Methods: - // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) - // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) - // - // - // To acquire the typedefs from "Base" (either a base class or template param) use: - // - // INHERIT_GIMPL_TYPES(Base) - // INHERIT_FIMPL_TYPES(Base) - // INHERIT_IMPL_TYPES(Base) - // - // The Fermion operators will do the following: - // - // struct MyOpParams { - // RealD mass; - // }; - // - // - // template - // class MyOp : public { - // public: - // - // INHERIT_ALL_IMPL_TYPES(Impl); - // - // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) - // { - // - // }; - // - // } - ////////////////////////////////////////////// - - - //////////////////////////////////////////////////////////////////////// - // Implementation dependent fermion types - //////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////// + // Template parameter class constructs to package + // externally control Fermion implementations + // in orthogonal directions + // + // Ultimately need Impl to always define types where XXX is opaque + // + // typedef typename XXX Simd; + // typedef typename XXX GaugeLinkField; + // typedef typename XXX GaugeField; + // typedef typename XXX GaugeActField; + // typedef typename XXX FermionField; + // typedef typename XXX DoubledGaugeField; + // typedef typename XXX SiteSpinor; + // typedef typename XXX SiteHalfSpinor; + // typedef typename XXX Compressor; + // + // and Methods: + // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) + // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // + // + // To acquire the typedefs from "Base" (either a base class or template param) use: + // + // INHERIT_GIMPL_TYPES(Base) + // INHERIT_FIMPL_TYPES(Base) + // INHERIT_IMPL_TYPES(Base) + // + // The Fermion operators will do the following: + // + // struct MyOpParams { + // RealD mass; + // }; + // + // + // template + // class MyOp : public { + // public: + // + // INHERIT_ALL_IMPL_TYPES(Impl); + // + // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) + // { + // + // }; + // + // } + ////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////// + // Implementation dependent fermion types + //////////////////////////////////////////////////////////////////////// + #define INHERIT_FIMPL_TYPES(Impl)\ - typedef typename Impl::FermionField FermionField; \ - typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ - typedef typename Impl::SiteSpinor SiteSpinor; \ - typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ - typedef typename Impl::Compressor Compressor; \ - typedef typename Impl::StencilImpl StencilImpl; \ - typedef typename Impl::ImplParams ImplParams; \ - typedef typename Impl::Coeff_t Coeff_t; - + typedef typename Impl::FermionField FermionField; \ + typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ + typedef typename Impl::SiteSpinor SiteSpinor; \ + typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ + typedef typename Impl::Compressor Compressor; \ + typedef typename Impl::StencilImpl StencilImpl; \ + typedef typename Impl::ImplParams ImplParams; \ + typedef typename Impl::Coeff_t Coeff_t; + #define INHERIT_IMPL_TYPES(Base) \ - INHERIT_GIMPL_TYPES(Base) \ - INHERIT_FIMPL_TYPES(Base) + INHERIT_GIMPL_TYPES(Base) \ + INHERIT_FIMPL_TYPES(Base) + + ///////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index + ///////////////////////////////////////////////////////////////////////////// + template + class WilsonImpl : public PeriodicGaugeImpl > { + + public: + + static const int Dimension = Representation::Dimension; + typedef PeriodicGaugeImpl > Gimpl; + + //Necessary? + constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - /////// - // Single flavour four spinors with colour index - /////// - template - class WilsonImpl - : public PeriodicGaugeImpl > { - public: - static const int Dimension = Representation::Dimension; - typedef PeriodicGaugeImpl > Gimpl; - - //Necessary? - constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} + const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - - INHERIT_GIMPL_TYPES(Gimpl); + INHERIT_GIMPL_TYPES(Gimpl); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; + inline void multLink(SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu, + StencilEntry *SE, + StencilImpl &St) { + mult(&phi(), &U(mu), &chi()); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; + template + inline void loadLinkElement(Simd ®, ref &memory) { + reg = memory; + } - ImplParams Params; - - WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - inline void multLink(SiteHalfSpinor &phi, - const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, - int mu, - StencilEntry *SE, - StencilImpl &St) { - mult(&phi(), &U(mu), &chi()); + inline void DoubleStore(GridBase *GaugeGrid, + DoubledGaugeField &Uds, + const GaugeField &Umu) { + conformable(Uds._grid, GaugeGrid); + conformable(Umu._grid, GaugeGrid); + GaugeLinkField U(GaugeGrid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); } + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + GaugeLinkField link(mat._grid); + link = TraceIndex(outerProduct(Btilde,A)); + PokeIndex(mat,link,mu); + } - template - inline void loadLinkElement(Simd ®, - ref &memory) { - reg = memory; - } + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - inline void DoubleStore(GridBase *GaugeGrid, - DoubledGaugeField &Uds, - const GaugeField &Umu) { - conformable(Uds._grid, GaugeGrid); - conformable(Umu._grid, GaugeGrid); - GaugeLinkField U(GaugeGrid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - PokeIndex(Uds, U, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uds, U, mu + 4); + int Ls=Btilde._grid->_fdimensions[0]; + GaugeLinkField tmp(mat._grid); + tmp = zero; + + PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + int sU=sss; + for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ - GaugeLinkField link(mat._grid); - link = TraceIndex(outerProduct(Btilde,A)); - PokeIndex(mat,link,mu); - } + PokeIndex(mat,tmp,mu); - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - - int Ls=Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; + } + }; - PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - int sU=sss; - for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here - } - } - PokeIndex(mat,tmp,mu); + //////////////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index, 5d redblack + //////////////////////////////////////////////////////////////////////////////////// +template +class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { + public: + + static const int Dimension = Nrepresentation; + const bool LsVectorised=true; + typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplGaugeField = iVector >, Nd>; + template using iImplGaugeLink = iScalar > >; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef Lattice FermionField; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; + + bool overlapCommsCompute(void) { return false; }; + + template + inline void loadLinkElement(Simd ®, ref &memory) { + vsplat(reg, memory); + } + + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { + SiteGaugeLink UU; + for (int i = 0; i < Nrepresentation; i++) { + for (int j = 0; j < Nrepresentation; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); } - }; - - /////// - // Single flavour four spinors with colour index, 5d redblack - /////// - template - class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { - public: + } + mult(&phi(), &UU(), &chi()); + } - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; - typedef PeriodicGaugeImpl > Gimpl; - - INHERIT_GIMPL_TYPES(Gimpl); + inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) + { + SiteScalarGaugeField ScalarUmu; + SiteDoubledGaugeField ScalarUds; + + GaugeLinkField U(Umu._grid); + GaugeField Uadj(Umu._grid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uadj, U, mu); + } + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; - template using iImplGaugeField = iVector >, Nd>; - template using iImplGaugeLink = iScalar > >; + peekLocalSite(ScalarUmu, Umu, lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; + peekLocalSite(ScalarUmu, Uadj, lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - // Make the doubled gauge field a *scalar* - typedef iImplDoubledGaugeField - SiteDoubledGaugeField; // This is a scalar - typedef iImplGaugeField - SiteScalarGaugeField; // scalar - typedef iImplGaugeLink - SiteScalarGaugeLink; // scalar + pokeLocalSite(ScalarUds, Uds, lcoor); + } + } - typedef Lattice DoubledGaugeField; + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) + { + assert(0); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; - - ImplParams Params; - - DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return false; }; - - template - inline void loadLinkElement(Simd ®, ref &memory) { - vsplat(reg, memory); - } - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - SiteGaugeLink UU; - for (int i = 0; i < Nrepresentation; i++) { - for (int j = 0; j < Nrepresentation; j++) { - vsplat(UU()()(i, j), U(mu)()(i, j)); - } - } - mult(&phi(), &UU(), &chi()); - } - - inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, - const GaugeField &Umu) { - SiteScalarGaugeField ScalarUmu; - SiteDoubledGaugeField ScalarUds; - - GaugeLinkField U(Umu._grid); - GaugeField Uadj(Umu._grid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uadj, U, mu); - } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUmu, Umu, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - - peekLocalSite(ScalarUmu, Uadj, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } - } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, - FermionField &A, int mu) { + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu) + { assert(0); - } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - assert(0); - } - }; + } +}; //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// - template - class GparityWilsonImpl - : public ConjugateGaugeImpl > { - public: - static const int Dimension = Nrepresentation; +template +class GparityWilsonImpl : public ConjugateGaugeImpl > { + public: - const bool LsVectorised=false; + static const int Dimension = Nrepresentation; - typedef _Coeff_t Coeff_t; - typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + const bool LsVectorised=false; + + typedef _Coeff_t Coeff_t; + typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); - INHERIT_GIMPL_TYPES(Gimpl); + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - template - using iImplSpinor = - iVector, Ns>, Ngp>; - template - using iImplHalfSpinor = - iVector, Nhs>, Ngp>; - template - using iImplDoubledGaugeField = - iVector >, Nds>, Ngp>; + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonStencil StencilImpl; + + typedef GparityWilsonImplParams ImplParams; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; - - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; - - typedef WilsonCompressor Compressor; - typedef WilsonStencil StencilImpl; + ImplParams Params; - typedef GparityWilsonImplParams ImplParams; - - ImplParams Params; + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + // provide the multiply by link that is differentiated between Gparity (with + // flavour index) and non-Gparity + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - // provide the multiply by link that is differentiated between Gparity (with - // flavour index) and non-Gparity - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - typedef SiteHalfSpinor vobj; - typedef typename SiteHalfSpinor::scalar_object sobj; + typedef SiteHalfSpinor vobj; + typedef typename SiteHalfSpinor::scalar_object sobj; - vobj vtmp; - sobj stmp; + vobj vtmp; + sobj stmp; - GridBase *grid = St._grid; + GridBase *grid = St._grid; - const int Nsimd = grid->Nsimd(); + const int Nsimd = grid->Nsimd(); - int direction = St._directions[mu]; - int distance = St._distances[mu]; - int ptype = St._permute_type[mu]; - int sl = St._grid->_simd_layout[direction]; + int direction = St._directions[mu]; + int distance = St._distances[mu]; + int ptype = St._permute_type[mu]; + int sl = St._grid->_simd_layout[direction]; + + // Fixme X.Y.Z.T hardcode in stencil + int mmu = mu % Nd; - // Fixme X.Y.Z.T hardcode in stencil - int mmu = mu % Nd; + // assert our assumptions + assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code + assert((sl == 1) || (sl == 2)); + + std::vector icoor; - // assert our assumptions - assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code - assert((sl == 1) || (sl == 2)); - - std::vector icoor; - - if ( SE->_around_the_world && Params.twists[mmu] ) { + if ( SE->_around_the_world && Params.twists[mmu] ) { - if ( sl == 2 ) { + if ( sl == 2 ) { + + std::vector vals(Nsimd); - std::vector vals(Nsimd); + extract(chi,vals); + for(int s=0;siCoorFromIindex(icoor,s); + grid->iCoorFromIindex(icoor,s); - assert((icoor[direction]==0)||(icoor[direction]==1)); + assert((icoor[direction]==0)||(icoor[direction]==1)); - int permute_lane; - if ( distance == 1) { - permute_lane = icoor[direction]?1:0; - } else { - permute_lane = icoor[direction]?0:1; + int permute_lane; + if ( distance == 1) { + permute_lane = icoor[direction]?1:0; + } else { + permute_lane = icoor[direction]?0:1; + } + + if ( permute_lane ) { + stmp(0) = vals[s](1); + stmp(1) = vals[s](0); + vals[s] = stmp; } - - if ( permute_lane ) { - stmp(0) = vals[s](1); - stmp(1) = vals[s](0); - vals[s] = stmp; - } - } - merge(vtmp,vals); + } + merge(vtmp,vals); + + } else { + vtmp(0) = chi(1); + vtmp(1) = chi(0); + } + mult(&phi(0),&U(0)(mu),&vtmp(0)); + mult(&phi(1),&U(1)(mu),&vtmp(1)); + + } else { + mult(&phi(0),&U(0)(mu),&chi(0)); + mult(&phi(1),&U(1)(mu),&chi(1)); + } + + } - } else { - vtmp(0) = chi(1); - vtmp(1) = chi(0); - } - mult(&phi(0),&U(0)(mu),&vtmp(0)); - mult(&phi(1),&U(1)(mu),&vtmp(1)); - - } else { - mult(&phi(0),&U(0)(mu),&chi(0)); - mult(&phi(1),&U(1)(mu),&chi(1)); - } + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + { + conformable(Uds._grid,GaugeGrid); + conformable(Umu._grid,GaugeGrid); + + GaugeLinkField Utmp (GaugeGrid); + GaugeLinkField U (GaugeGrid); + GaugeLinkField Uconj(GaugeGrid); + + Lattice > coor(GaugeGrid); - } - - inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - { - - conformable(Uds._grid,GaugeGrid); - conformable(Umu._grid,GaugeGrid); - - GaugeLinkField Utmp (GaugeGrid); - GaugeLinkField U (GaugeGrid); - GaugeLinkField Uconj(GaugeGrid); - - Lattice > coor(GaugeGrid); - - - for(int mu=0;mu(Umu,mu); - Uconj = conjugate(U); + U = PeekIndex(Umu,mu); + Uconj = conjugate(U); + + // This phase could come from a simple bc 1,1,-1,1 .. + int neglink = GaugeGrid->GlobalDimensions()[mu]-1; + if ( Params.twists[mu] ) { + Uconj = where(coor==neglink,-Uconj,Uconj); + } - // This phase could come from a simple bc 1,1,-1,1 .. - int neglink = GaugeGrid->GlobalDimensions()[mu]-1; - if ( Params.twists[mu] ) { - Uconj = where(coor==neglink,-Uconj,Uconj); - } +PARALLEL_FOR_LOOP + for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - PARALLEL_FOR_LOOP - for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { - link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); - } - PokeIndex(mat, link, mu); - return; - } + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { + + // DhopDir provides U or Uconj depending on coor/flavour. + GaugeLinkField link(mat._grid); + // use lorentz for flavour as hack. + auto tmp = TraceIndex(outerProduct(Btilde, A)); +PARALLEL_FOR_LOOP + for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { + link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); + } + PokeIndex(mat, link, mu); + return; + } - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - int Ls = Btilde._grid->_fdimensions[0]; + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { + + int Ls = Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; - PARALLEL_FOR_LOOP - for (int ss = 0; ss < tmp._grid->oSites(); ss++) { - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); - tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - } - PokeIndex(mat, tmp, mu); - return; - } - }; + GaugeLinkField tmp(mat._grid); + tmp = zero; +PARALLEL_FOR_LOOP + for (int ss = 0; ss < tmp._grid->oSites(); ss++) { + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); + tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + } + PokeIndex(mat, tmp, mu); + return; + } - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +}; + typedef WilsonImpl WilsonImplR; // Real.. whichever prec + typedef WilsonImpl WilsonImplF; // Float + typedef WilsonImpl WilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplF; // Float + typedef WilsonImpl ZWilsonImplD; // Double + + typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec + typedef WilsonImpl WilsonAdjImplF; // Float + typedef WilsonImpl WilsonAdjImplD; // Double + + typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec + typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float + typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double + + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double + + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + + typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec + typedef GparityWilsonImpl GparityWilsonImplF; // Float + typedef GparityWilsonImpl GparityWilsonImplD; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +}} - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double - - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double -} -} #endif diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index d887e05b..4bc28bc7 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -166,7 +166,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// PARALLEL_FOR_LOOP for (int sss = 0; sss < B._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); } @@ -277,7 +277,7 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, + Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } }; @@ -295,13 +295,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 7c4fafff..8c059667 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -184,44 +184,37 @@ void WilsonFermion5D::Report(void) if ( DhopCalls > 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime - << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " - << DhopComputeTime << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<::DerivInternal(StencilImpl & st, assert(sF < B._grid->oSites()); assert(sU < U._grid->oSites()); - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, - gamma); + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); //////////////////////////// // spin trace outer product @@ -342,10 +334,10 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, } template -void WilsonFermion5D::DhopDeriv( GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) +void WilsonFermion5D::DhopDeriv(GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionGrid()); conformable(A._grid,B._grid); @@ -358,9 +350,9 @@ void WilsonFermion5D::DhopDeriv( GaugeField &mat, template void WilsonFermion5D::DhopDerivEO(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -376,9 +368,9 @@ void WilsonFermion5D::DhopDerivEO(GaugeField &mat, template void WilsonFermion5D::DhopDerivOE(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -393,8 +385,8 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, template void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) { // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -412,27 +404,25 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); } #ifdef AVX512 } else if (stat.is_init() ) { int nthreads; stat.start(); - #pragma omp parallel +#pragma omp parallel { - #pragma omp master +#pragma omp master nthreads = omp_get_num_threads(); int mythread = omp_get_thread_num(); stat.enter(mythread); - #pragma omp for nowait - for(int ss=0;ssoSites();ss++) - { - int sU=ss; - int sF=LLs*sU; - Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); - } +#pragma omp for nowait + for(int ss=0;ssoSites();ss++) { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } stat.exit(mythread); } stat.accum(nthreads); @@ -442,8 +432,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } } DhopComputeTime+=usecond(); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index bc7ec543..295e2997 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -34,155 +34,154 @@ Author: paboyle #include namespace Grid { +namespace QCD { - namespace QCD { + //////////////////////////////////////////////////////////////////////////////// + // This is the 4d red black case appropriate to support + // + // parity = (x+y+z+t)|2; + // generalised five dim fermions like mobius, zolotarev etc.. + // + // i.e. even even contains fifth dim hopping term. + // + // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] + //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - // This is the 4d red black case appropriate to support - // - // parity = (x+y+z+t)|2; - // generalised five dim fermions like mobius, zolotarev etc.. - // - // i.e. even even contains fifth dim hopping term. - // - // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] - //////////////////////////////////////////////////////////////////////////////// - - class WilsonFermion5DStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static const std::vector directions; - static const std::vector displacements; - const int npoint = 8; - }; - - template - class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic - { - public: - INHERIT_IMPL_TYPES(Impl); - typedef WilsonKernels Kernels; - PmuStat stat; - - void Report(void); - void ZeroCounters(void); - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - - double DerivCalls; - double DerivCommTime; - double DerivComputeTime; - double DerivDhopComputeTime; - - /////////////////////////////////////////////////////////////// - // Implement the abstract base - /////////////////////////////////////////////////////////////// - GridBase *GaugeGrid(void) { return _FourDimGrid ;} - GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} - GridBase *FermionGrid(void) { return _FiveDimGrid;} - GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} - - // full checkerboard operations; leave unimplemented as abstract for now - virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - - // half checkerboard operations; leave unimplemented as abstract for now - virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; - - virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; - virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac - - // These can be overridden by fancy 5d chiral action - virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); - - // Implement hopping term non-hermitian hopping term; half cb or both - // Implement s-diagonal DW - void DW (const FermionField &in, FermionField &out,int dag); - void Dhop (const FermionField &in, FermionField &out,int dag); - void DhopOE(const FermionField &in, FermionField &out,int dag); - void DhopEO(const FermionField &in, FermionField &out,int dag); - - // add a DhopComm + class WilsonFermion5DStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static const std::vector directions; + static const std::vector displacements; + const int npoint = 8; + }; + + template + class WilsonFermion5D : public WilsonKernels, public WilsonFermion5DStatic + { + public: + INHERIT_IMPL_TYPES(Impl); + typedef WilsonKernels Kernels; + PmuStat stat; + + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _FourDimGrid ;} + GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;} + GridBase *FermionGrid(void) { return _FiveDimGrid;} + GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} + + // full checkerboard operations; leave unimplemented as abstract for now + virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + + // half checkerboard operations; leave unimplemented as abstract for now + virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; + + virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac + + // These can be overridden by fancy 5d chiral action + virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); + + // Implement hopping term non-hermitian hopping term; half cb or both + // Implement s-diagonal DW + void DW (const FermionField &in, FermionField &out,int dag); + void Dhop (const FermionField &in, FermionField &out,int dag); + void DhopOE(const FermionField &in, FermionField &out,int dag); + void DhopEO(const FermionField &in, FermionField &out,int dag); + + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. - void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); - - /////////////////////////////////////////////////////////////// - // New methods added - /////////////////////////////////////////////////////////////// - void DerivInternal(StencilImpl & st, - DoubledGaugeField & U, - GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag); - - void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - // Constructors - WilsonFermion5D(GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _M5,const ImplParams &p= ImplParams()); - - // Constructors - /* + void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + + /////////////////////////////////////////////////////////////// + // New methods added + /////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl & st, + DoubledGaugeField & U, + GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag); + + void DhopInternal(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors + WilsonFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + + // Constructors + /* WilsonFermion5D(int simd, - GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - double _M5,const ImplParams &p= ImplParams()); - */ + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + double _M5,const ImplParams &p= ImplParams()); + */ + + // DoubleStore + void ImportGauge(const GaugeField &_Umu); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + public: + + // Add these to the support from Wilson + GridBase *_FourDimGrid; + GridBase *_FourDimRedBlackGrid; + GridBase *_FiveDimGrid; + GridBase *_FiveDimRedBlackGrid; + + double M5; + int Ls; + + //Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + // Comms buffer + std::vector > comm_buf; + + }; - // DoubleStore - void ImportGauge(const GaugeField &_Umu); - - /////////////////////////////////////////////////////////////// - // Data members require to support the functionality - /////////////////////////////////////////////////////////////// - public: - - // Add these to the support from Wilson - GridBase *_FourDimGrid; - GridBase *_FourDimRedBlackGrid; - GridBase *_FiveDimGrid; - GridBase *_FiveDimRedBlackGrid; - - double M5; - int Ls; - - //Defines the stencils for even and odd - StencilImpl Stencil; - StencilImpl StencilEven; - StencilImpl StencilOdd; - - // Copy of the gauge field , with even and odd subsets - DoubledGaugeField Umu; - DoubledGaugeField UmuEven; - DoubledGaugeField UmuOdd; - - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - - // Comms buffer - std::vector > comm_buf; - - }; - } -} +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 58e62cc8..52ee8704 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -43,10 +43,9 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// template -void WilsonKernels::DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -220,10 +219,9 @@ void WilsonKernels::DiracOptGenericDhopSiteDag( // Need controls to do interior, exterior, or both template -void WilsonKernels::DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -396,10 +394,9 @@ void WilsonKernels::DiracOptGenericDhopSite( }; template -void WilsonKernels::DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out, int dir, int gamma) { +void WilsonKernels::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out, int dir, int gamma) { + SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteSpinor result; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 66770c3c..669ee4be 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -32,175 +32,132 @@ directory #define GRID_QCD_DHOP_H namespace Grid { +namespace QCD { - namespace QCD { - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Helper routines that implement Wilson stencil for a single site. - // Common to both the WilsonFermion and WilsonFermion5D - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class WilsonKernelsStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static int AsmOpt; // these are a temporary hack - static int HandOpt; // these are a temporary hack - }; - - template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { - public: - - INHERIT_IMPL_TYPES(Impl); - typedef FermionOperator Base; - - public: - - template - typename std::enable_if::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Helper routines that implement Wilson stencil for a single site. + // Common to both the WilsonFermion and WilsonFermion5D + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class WilsonKernelsStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static int AsmOpt; // these are a temporary hack + static int HandOpt; // these are a temporary hack +}; + +template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { + public: + + INHERIT_IMPL_TYPES(Impl); + typedef FermionOperator Base; + +public: + + template + typename std::enable_if::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { #ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, - in, out); - - } else { + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { #else - { + { #endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); + sF++; } - - template - typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, - out); - sF++; - } - sU++; - } - } - - template - typename std::enable_if::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { -#ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, - Ns, in, out); - } else { -#else - { -#endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, - sU, in, out); - sF++; - } - sU++; - } - } - } - - template - typename std::enable_if< - (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, - void>::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } - - void DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, - int gamma); - - private: - // Specialised variants - void DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptAsmDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptAsmDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptHandDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptHandDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - public: - WilsonKernels(const ImplParams &p = ImplParams()); - }; - + sU++; } } + } + + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); + sF++; + } + sU++; + } + } + + template + typename std::enable_if::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { +#else + { +#endif + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + } + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); + +private: + // Specialised variants + void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + + void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + +public: + + WilsonKernels(const ImplParams &p = ImplParams()); + +}; + +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 7857e89a..74862400 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -33,31 +33,27 @@ Author: paboyle namespace Grid { - namespace QCD { +namespace QCD { - /////////////////////////////////////////////////////////// - // Default to no assembler implementation - /////////////////////////////////////////////////////////// - template - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - template - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - +/////////////////////////////////////////////////////////// +// Default to no assembler implementation +/////////////////////////////////////////////////////////// +template void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} +template void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} #if defined(AVX512) - /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// @@ -65,16 +61,16 @@ namespace Grid { #include #include - static Vector signs; +static Vector signs; - int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; - } - static int signInit = setupSigns(); + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -84,17 +80,15 @@ namespace Grid { #define FX(A) WILSONASM_ ##A #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef VMOVIDUP @@ -109,31 +103,26 @@ namespace Grid { #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif - #define INSTANTIATE_ASM(A)\ -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ + \ +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ - INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(ZWilsonImplF); @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); - } -} + +}} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 9d7eac23..f5900832 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -311,10 +311,9 @@ namespace Grid { namespace QCD { - template - void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -554,10 +553,9 @@ namespace QCD { } } - template - void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { // std::cout << "Hand op Dhop "< -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } @@ -840,12 +835,10 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, // Need Nc=3 though // #define INSTANTIATE_THEM(A) \ -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 0e35a414..1b71b8a5 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; else - Check._odata[i] = myStencil.comm_buf[SE->_offset]; + Check._odata[i] = myStencil.CommBuf()[SE->_offset]; } Real nrmC = norm2(Check); @@ -207,7 +207,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; else - OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; + OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; } for(int i=0;ioSites();i++){ int permute_type; @@ -220,7 +220,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; else - ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; + ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; } setCheckerboard(Check,ECheck);