diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 6a283085..c9e2fa22 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -153,9 +153,10 @@ int main (int argc, char ** argv) std::cout< 1.0e-6 ) { - std::cout << "site "<1.0e-5) { + setCheckerboard(ssrc,ssrc_o); + setCheckerboard(ssrc,ssrc_e); + std::cout<< ssrc << std::endl; + } } @@ -307,7 +314,7 @@ int main (int argc, char ** argv) std::cout< dimensions; std::vector processors; std::vector processor_coor; - + public: - - static const int forward =FFTW_FORWARD; + + static const int forward=FFTW_FORWARD; static const int backward=FFTW_BACKWARD; - + double Flops(void) {return flops;} double MFlops(void) {return flops/usec;} - - FFT ( GridCartesian * grid ) : - vgrid(grid), - Nd(grid->_ndimension), - dimensions(grid->_fdimensions), - processors(grid->_processors), - processor_coor(grid->_processor_coor) + + FFT ( GridCartesian * grid ) : + vgrid(grid), + Nd(grid->_ndimension), + dimensions(grid->_fdimensions), + processors(grid->_processors), + processor_coor(grid->_processor_coor) { flops=0; usec =0; std::vector layout(Nd,1); sgrid = new GridCartesian(dimensions,layout,processors); }; - - ~FFT ( void) { - delete sgrid; + + ~FFT ( void) { + delete sgrid; } template @@ -164,145 +164,118 @@ namespace Grid { template void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){ - +#ifndef HAVE_FFTW + assert(0); +#else conformable(result._grid,vgrid); conformable(source._grid,vgrid); int L = vgrid->_ldimensions[dim]; int G = vgrid->_fdimensions[dim]; - + std::vector layout(Nd,1); std::vector pencil_gd(vgrid->_fdimensions); - - pencil_gd[dim] = G*processors[dim]; - + + pencil_gd[dim] = G*processors[dim]; + // Pencil global vol LxLxGxLxL per node GridCartesian pencil_g(pencil_gd,layout,processors); - + // Construct pencils typedef typename vobj::scalar_object sobj; typedef typename sobj::scalar_type scalar; - /* - std::cout << "FFT : vobj "< pgbuf(&pencil_g); + - Lattice ssource(vgrid); ssource =source; - Lattice pgsource(&pencil_g); - Lattice pgresult(&pencil_g); pgresult=zero; - -#ifndef HAVE_FFTW - assert(0); -#else typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; - - { - int Ncomp = sizeof(sobj)/sizeof(scalar); - int Nlow = 1; - for(int d=0;d_ldimensions[d]; - } - - int rank = 1; /* 1d transforms */ - int n[] = {G}; /* 1d transforms of length G */ - int howmany = Ncomp; - int odist,idist,istride,ostride; - idist = odist = 1; /* Distance between consecutive FT's */ - istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ - int *inembed = n, *onembed = n; - - scalar div; - if ( sign == backward ) div = 1.0/G; - else if ( sign == forward ) div = 1.0; - else assert(0); - - FFTW_plan p; - { - FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0]; - FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0]; - p = FFTW::fftw_plan_many_dft(rank,n,howmany, - in,inembed, - istride,idist, - out,onembed, - ostride, odist, - sign,FFTW_ESTIMATE); - } - - double add,mul,fma; - FFTW::fftw_flops(p,&add,&mul,&fma); - flops_call = add+mul+2.0*fma; - - GridStopWatch timer; - - // Barrel shift and collect global pencil - for(int p=0;plSites();idx++) { - - std::vector lcoor(Nd); - sgrid->LocalIndexToLocalCoor(idx,lcoor); - - sobj s; - - peekLocalSite(s,ssource,lcoor); - - lcoor[dim]+=p*L; - - pokeLocalSite(s,pgsource,lcoor); - } - - ssource = Cshift(ssource,dim,L); - } - - // Loop over orthog coords - int NN=pencil_g.lSites(); - - 
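// ---------------------------------------------------------------------------
// A minimal, self-contained sketch of the fftw_plan_many_dft conventions used
// by the plan above: howmany batched 1-D transforms whose elements are strided
// through interleaved component data, executed in place. The sizes here
// (G, Ncomp, Nlow) are illustrative stand-ins, not Grid's actual values.
// ---------------------------------------------------------------------------
#include <fftw3.h>
#include <complex>
#include <vector>

int main(void) {
  const int G = 16, Ncomp = 4, Nlow = 8;    // length, components, inner sites
  std::vector<std::complex<double> > buf(G * Ncomp * Nlow);

  int n[] = {G};                 // each transform is 1-D of length G
  int howmany = Ncomp;           // one transform per component
  int idist = 1, odist = 1;      // consecutive transforms start 1 element apart
  int istride = Ncomp * Nlow;    // elements within one transform are this far apart
  int ostride = Ncomp * Nlow;

  fftw_complex *io = reinterpret_cast<fftw_complex *>(buf.data());
  fftw_plan p = fftw_plan_many_dft(1, n, howmany,
                                   io, n, istride, idist,
                                   io, n, ostride, odist,
                                   FFTW_FORWARD, FFTW_ESTIMATE);
  fftw_execute(p);               // in place, as with the single pgbuf above
  fftw_destroy_plan(p);
  return 0;
}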
GridStopWatch Timer; - Timer.Start(); - -PARALLEL_FOR_LOOP - for(int idx=0;idx lcoor(Nd); - pencil_g.LocalIndexToLocalCoor(idx,lcoor); - - if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 - FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx]; - FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx]; - FFTW::fftw_execute_dft(p,in,out); - } - } - - Timer.Stop(); - usec += Timer.useconds(); - flops+= flops_call*NN; - - int pc = processor_coor[dim]; - for(int idx=0;idxlSites();idx++) { - std::vector lcoor(Nd); - sgrid->LocalIndexToLocalCoor(idx,lcoor); - std::vector gcoor = lcoor; - // extract the result - sobj s; - gcoor[dim] = lcoor[dim]+L*pc; - peekLocalSite(s,pgresult,gcoor); - s = s * div; - pokeLocalSite(s,result,lcoor); - } - - FFTW::fftw_destroy_plan(p); + + int Ncomp = sizeof(sobj)/sizeof(scalar); + int Nlow = 1; + for(int d=0;d_ldimensions[d]; } + + int rank = 1; /* 1d transforms */ + int n[] = {G}; /* 1d transforms of length G */ + int howmany = Ncomp; + int odist,idist,istride,ostride; + idist = odist = 1; /* Distance between consecutive FT's */ + istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ + int *inembed = n, *onembed = n; + + scalar div; + if ( sign == backward ) div = 1.0/G; + else if ( sign == forward ) div = 1.0; + else assert(0); + + FFTW_plan p; + { + FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0]; + FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0]; + p = FFTW::fftw_plan_many_dft(rank,n,howmany, + in,inembed, + istride,idist, + out,onembed, + ostride, odist, + sign,FFTW_ESTIMATE); + } + + // Barrel shift and collect global pencil + std::vector lcoor(Nd), gcoor(Nd); + result = source; + for(int p=0;plSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + sobj s; + peekLocalSite(s,result,lcoor); + lcoor[dim]+=p*L; + pokeLocalSite(s,pgbuf,lcoor); + } + result = Cshift(result,dim,L); + } + + // Loop over orthog coords + int NN=pencil_g.lSites(); + GridStopWatch timer; + timer.Start(); + //PARALLEL_FOR_LOOP + for(int idx=0;idx::fftw_execute_dft(p,in,out); + } + } + timer.Stop(); + + // performance counting + double add,mul,fma; + FFTW::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + usec += timer.useconds(); + flops+= flops_call*NN; + + // writing out result + int pc = processor_coor[dim]; + for(int idx=0;idxlSites();idx++) { + sgrid->LocalIndexToLocalCoor(idx,lcoor); + gcoor = lcoor; + sobj s; + gcoor[dim] = lcoor[dim]+L*pc; + peekLocalSite(s,pgbuf,gcoor); + s = s * div; + pokeLocalSite(s,result,lcoor); + } + + // destroying plan + FFTW::fftw_destroy_plan(p); #endif - - } - }; - - } #endif diff --git a/lib/Init.cc b/lib/Init.cc index 06dae468..dbc5706c 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -195,14 +195,17 @@ std::string GridCmdVectorIntToString(const std::vector & vec){ ///////////////////////////////////////////////////////// // ///////////////////////////////////////////////////////// +static int Grid_is_initialised = 0; + + void Grid_init(int *argc,char ***argv) { + GridLogger::StopWatch.Start(); + CartesianCommunicator::Init(argc,argv); // Parse command line args. 
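// ---------------------------------------------------------------------------
// The --timestamp option added above turns on an elapsed-time prefix on every
// log line, sampled from a stopwatch started at the very top of Grid_init
// (hence moving StopWatch.Start() ahead of communicator init). A minimal
// sketch of that pattern; the names here are illustrative, not the actual
// GridStopWatch/Logger API.
// ---------------------------------------------------------------------------
#include <chrono>
#include <iostream>
#include <string>

static auto t0 = std::chrono::steady_clock::now(); // started once, at init
static bool timestamp_on = false;                  // flipped by a --timestamp style flag

void log_line(const std::string &msg) {
  if (timestamp_on) {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - t0).count();
    std::cout << us << " us : ";
  }
  std::cout << msg << std::endl;
}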
- GridLogger::StopWatch.Start(); - std::string arg; std::vector logstreams; std::string defaultLog("Error,Warning,Message,Performance"); @@ -240,11 +243,14 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ LebesgueOrder::UseLebesgueOrder=1; } - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); } + if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){ + GridLogTimestamp(1); + } + GridParseLayout(*argv,*argc, Grid_default_latt, Grid_default_mpi); @@ -298,12 +304,14 @@ void Grid_init(int *argc,char ***argv) std::cout << "GNU General Public License for more details."< &GridDefaultMpi(void); const int &GridThreads(void) ; void GridSetThreads(int t) ; + void GridLogTimestamp(int); // Common parsing chores std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); diff --git a/lib/Log.cc b/lib/Log.cc index 5152bd77..733bfc58 100644 --- a/lib/Log.cc +++ b/lib/Log.cc @@ -49,8 +49,13 @@ namespace Grid { } GridStopWatch Logger::StopWatch; +int Logger::timestamp; std::ostream Logger::devnull(0); +void GridLogTimestamp(int on){ + Logger::Timestamp(on); +} + Colours GridLogColours(0); GridLogger GridLogError(1, "Error", GridLogColours, "RED"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); @@ -88,7 +93,7 @@ void GridLogConfigure(std::vector &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#ifdef GRID_COMMS_MPI +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/Log.h b/lib/Log.h index 9f9e19a7..d7422ca9 100644 --- a/lib/Log.h +++ b/lib/Log.h @@ -37,10 +37,11 @@ #include #endif - namespace Grid { +namespace Grid { +////////////////////////////////////////////////////////////////////////////////////////////////// // Dress the output; use std::chrono for time stamping via the StopWatch class -int Rank(void); // used for early stage debug before library init +////////////////////////////////////////////////////////////////////////////////////////////////// class Colours{ @@ -55,7 +56,6 @@ public: void Active(bool activate){ is_active=activate; - if (is_active){ colour["BLACK"] ="\033[30m"; colour["RED"] ="\033[31m"; @@ -66,21 +66,18 @@ public: colour["CYAN"] ="\033[36m"; colour["WHITE"] ="\033[37m"; colour["NORMAL"] ="\033[0;39m"; - } else { - colour["BLACK"] =""; - colour["RED"] =""; - colour["GREEN"] =""; - colour["YELLOW"]=""; - colour["BLUE"] =""; - colour["PURPLE"]=""; - colour["CYAN"] =""; - colour["WHITE"] =""; - colour["NORMAL"]=""; - } - - -}; - + } else { + colour["BLACK"] =""; + colour["RED"] =""; + colour["GREEN"] =""; + colour["YELLOW"]=""; + colour["BLUE"] =""; + colour["PURPLE"]=""; + colour["CYAN"] =""; + colour["WHITE"] =""; + colour["NORMAL"]=""; + } + }; }; @@ -88,6 +85,7 @@ class Logger { protected: Colours &Painter; int active; + static int timestamp; std::string name, topName; std::string COLOUR; @@ -99,25 +97,28 @@ public: std::string evidence() {return Painter.colour["YELLOW"];} std::string colour() {return Painter.colour[COLOUR];} - Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) - : active(on), - name(nm), - topName(topNm), - Painter(col_class), - COLOUR(col){} ; + Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : 
active(on), + name(nm), + topName(topNm), + Painter(col_class), + COLOUR(col) {} ; void Active(int on) {active = on;}; int isActive(void) {return active;}; + static void Timestamp(int on) {timestamp = on;}; friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { - StopWatch.Stop(); - GridTime now = StopWatch.Elapsed(); - StopWatch.Start(); stream << log.background()<< log.topName << log.background()<< " : "; stream << log.colour() < void -Gather_plane_simple_table_compute (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector >& table) +inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask, + int off,std::vector > & table) { table.resize(0); - int rd = rhs._grid->_rdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; - if ( !rhs._grid->CheckerBoarded(dimension) ) { + if ( !grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } - int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int e1=rhs._grid->_slice_nblock[dimension]; - int e2=rhs._grid->_slice_block[dimension]; + int so= plane*grid->_ostride[dimension]; // base offset for start of plane + int e1=grid->_slice_nblock[dimension]; + int e2=grid->_slice_block[dimension]; - int stride=rhs._grid->_slice_stride[dimension]; + int stride=grid->_slice_stride[dimension]; if ( cbmask == 0x3 ) { table.resize(e1*e2); for(int n=0;n &rhs,commVector &bu for(int n=0;nCheckerBoardFromOindexTable(o+b); + int ocb=1<CheckerBoardFromOindexTable(o+b); if ( ocb &cbmask ) { table[bo]=std::pair(bo,o+b); bo++; } @@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice &rhs,commVector &bu } template void -Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,commVector &buffer, - compressor &compress, int off,int so) +Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { PARALLEL_FOR_LOOP for(int i=0;i void -Gather_plane_simple_stencil (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask,compressor &compress, int off, - double &t_table ,double & t_data ) -{ - std::vector > table; - Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table); - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - Gather_plane_simple_table (table,rhs,buffer,compress,off,so); -} +struct StencilEntry { + uint64_t _offset; + uint64_t _byte_offset; + uint16_t _is_local; + uint16_t _permute; + uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline +}; +template +class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. 
+ public: + typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; + typedef uint32_t StencilInteger; + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + typedef typename cobj::scalar_object scalar_object; + + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; - struct StencilEntry { - uint64_t _offset; - uint64_t _byte_offset; - uint16_t _is_local; - uint16_t _permute; - uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline - }; + std::vector Packets; - template - class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. - public: + int face_table_computed; + std::vector > > face_table ; + + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + comms_bytes+=2.0*bytes; + Packets.push_back(p); + } - typedef uint32_t StencilInteger; - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - typedef typename cobj::scalar_object scalar_object; + void CommunicateBegin(std::vector > &reqs) + { + reqs.resize(Packets.size()); + commtime-=usecond(); + for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + /* + }else{ + _grid->SendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + */ + } + commtime+=usecond(); + } + void CommunicateComplete(std::vector > &reqs) + { + commtime-=usecond(); - ////////////////////////////////////////// - // Comms packet queue for asynch thread - ////////////////////////////////////////// + for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + // else + // _grid->SendToRecvFromComplete(reqs[i]); + } + commtime+=usecond(); + } - struct Packet { - void * send_buf; - void * recv_buf; - Integer to_rank; - Integer from_rank; - Integer bytes; - volatile Integer done; - }; + /////////////////////////////////////////// + // Simd merge queue for asynch comms + /////////////////////////////////////////// + struct Merge { + cobj * mpointer; + std::vector rpointers; + Integer buffer_size; + Integer packet_id; + }; + + std::vector Mergers; - std::vector Packets; + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { + Merge m; + m.mpointer = merge_p; + m.rpointers= rpointers; + m.buffer_size = buffer_size; + m.packet_id = packet_id; + Mergers.push_back(m); + } - int face_table_computed; - std::vector > > face_table ; - -#define SEND_IMMEDIATE -#define SERIAL_SENDS + void CommsMerge(void ) { - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ - #ifdef SEND_IMMEDIATE - commtime-=usecond(); - _grid->SendToRecvFrom(xmit,to,rcv,from,bytes); - commtime+=usecond(); - #endif - Packet p; - p.send_buf = xmit; - p.recv_buf = rcv; - p.to_rank = to; - p.from_rank= from; - p.bytes = bytes; - p.done = 0; - comms_bytes+=2.0*bytes; - Packets.push_back(p); + for(int i=0;iSendToRecvFrom( - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); - #endif - Packets[i].done = 1; - } - commtime+=usecond(); - } - #else - 
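// ---------------------------------------------------------------------------
// A sketch of the begin/complete split that CommunicateBegin/
// CommunicateComplete above implement: post every non-blocking send and
// receive first, then wait on the whole batch, so other work can proceed in
// between. Shown with plain MPI point-to-point calls and an assumed Packet
// layout; the real path goes through StencilSendToRecvFromBegin/Complete.
// ---------------------------------------------------------------------------
#include <mpi.h>
#include <vector>

struct Packet { void *send_buf, *recv_buf; int to_rank, from_rank, bytes; };

void communicate(std::vector<Packet> &packets, MPI_Comm comm) {
  std::vector<MPI_Request> reqs;
  reqs.reserve(2 * packets.size());
  for (auto &p : packets) {                       // "begin": post everything
    MPI_Request r;
    MPI_Irecv(p.recv_buf, p.bytes, MPI_CHAR, p.from_rank, 0, comm, &r);
    reqs.push_back(r);
    MPI_Isend(p.send_buf, p.bytes, MPI_CHAR, p.to_rank, 0, comm, &r);
    reqs.push_back(r);
  }
  MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE); // "complete"
}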
void Communicate(void ) { - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; - std::vector > reqs(Packets.size()); - commtime-=usecond(); - const int concurrency=2; - for(int i=0;iSendToRecvFromBegin(reqs[j], - Packets[j].send_buf, - Packets[j].to_rank, - Packets[j].recv_buf, - Packets[j].from_rank, - Packets[j].bytes); - #endif - } - } - for(int ii=0;iiSendToRecvFromComplete(reqs[i]); - #endif - } - } - for(int ii=0;ii _directions; + std::vector _distances; + std::vector _comm_buf_size; + std::vector _permute_type; + + // npoints x Osites() of these + // Flat vector, change layout for cache friendly. + Vector _entries; + + void PrecomputeByteOffsets(void){ + for(int i=0;i<_entries.size();i++){ + if( _entries[i]._is_local ) { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); + } else { + _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); + } + } + }; - /////////////////////////////////////////// - // Simd merge queue for asynch comms - /////////////////////////////////////////// - struct Merge { - cobj * mpointer; - std::vector rpointers; - Integer buffer_size; - Integer packet_id; - }; + inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } + inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + local = _entries[ent]._is_local; + perm = _entries[ent]._permute; + if (perm) ptype = _permute_type[point]; + if (local) { + return base + _entries[ent]._byte_offset; + } else { + return cbase + _entries[ent]._byte_offset; + } + } + inline uint64_t GetPFInfo(int ent,uint64_t base) { + uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; + int local = _entries[ent]._is_local; + if (local) return base + _entries[ent]._byte_offset; + else return cbase + _entries[ent]._byte_offset; + } + + /////////////////////////////////////////////////////////// + // Unified Comms buffers for all directions + /////////////////////////////////////////////////////////// + // Vectors that live on the symmetric heap in case of SHMEM + // std::vector > u_simd_send_buf_hide; + // std::vector > u_simd_recv_buf_hide; + // commVector u_send_buf_hide; + // commVector u_recv_buf_hide; - std::vector Mergers; + // These are used; either SHM objects or refs to the above symmetric heap vectors + // depending on comms target + cobj* u_recv_buf_p; + cobj* u_send_buf_p; + std::vector u_simd_send_buf; + std::vector u_simd_recv_buf; - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { - Merge m; - m.mpointer = merge_p; - m.rpointers= rpointers; - m.buffer_size = buffer_size; - m.packet_id = packet_id; - #ifdef SEND_IMMEDIATE - mergetime-=usecond(); - PARALLEL_FOR_LOOP - for(int o=0;o _directions; - std::vector _distances; - std::vector _comm_buf_size; - std::vector _permute_type; - - // npoints x Osites() of these - // Flat vector, change layout for cache friendly. 
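// ---------------------------------------------------------------------------
// A sketch of the flat, cache-friendly table layout that GetEntry above
// indexes: one entry per (point, outer site), stored at
// entries[point + npoints*osite], so a site's npoints neighbours sit
// contiguously in memory. Stand-alone illustration with simplified fields.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <vector>

struct Entry { uint64_t offset; uint16_t is_local, permute; };

inline Entry *get_entry(std::vector<Entry> &entries,
                        int npoints, int point, int osite) {
  return &entries[point + npoints * osite]; // neighbours of a site are adjacent
}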
- Vector _entries; - - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; } - - void PrecomputeByteOffsets(void){ - for(int i=0;i<_entries.size();i++){ - if( _entries[i]._is_local ) { - _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); - } else { - // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952 - _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); - } - } - }; - - inline uint64_t Touch(int ent) { - // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); - } - inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - local = _entries[ent]._is_local; - perm = _entries[ent]._permute; - if (perm) ptype = _permute_type[point]; - if (local) { - return base + _entries[ent]._byte_offset; - } else { - return cbase + _entries[ent]._byte_offset; - } - } - inline uint64_t GetPFInfo(int ent,uint64_t base) { - uint64_t cbase = (uint64_t)&comm_buf[0]; - int local = _entries[ent]._is_local; - if (local) return base + _entries[ent]._byte_offset; - else return cbase + _entries[ent]._byte_offset; - } - - // Comms buffers - std::vector > u_simd_send_buf; - std::vector > u_simd_recv_buf; - commVector u_send_buf; - commVector comm_buf; - int u_comm_offset; - int _unified_buffer_size; - - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// #define TIMING_HACK #ifdef TIMING_HACK - double jointime; - double gathertime; - double commtime; - double halogtime; - double mergetime; - double spintime; - double comms_bytes; - double gathermtime; - double splicetime; - double nosplicetime; - double t_data; - double t_table; - double calls; - - void ZeroCounters(void) { - gathertime = 0.; - jointime = 0.; - commtime = 0.; - halogtime = 0.; - mergetime = 0.; - spintime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - t_data = 0.0; - t_table= 0.0; - comms_bytes = 0.; - calls = 0.; - }; - - void Report(void) { + double jointime; + double gathertime; + double commtime; + double halogtime; + double mergetime; + double spintime; + double comms_bytes; + double gathermtime; + double splicetime; + double nosplicetime; + double t_data; + double t_table; + double calls; + + void ZeroCounters(void) { + gathertime = 0.; + jointime = 0.; + commtime = 0.; + halogtime = 0.; + mergetime = 0.; + spintime = 0.; + gathermtime = 0.; + splicetime = 0.; + nosplicetime = 0.; + t_data = 0.0; + t_table= 0.0; + comms_bytes = 0.; + calls = 0.; + }; + + void Report(void) { #define PRINTIT(A) \ std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls< 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "< 0. ) { + std::cout << GridLogMessage << " Stencil calls "<1.0){ + PRINTIT(comms_bytes); + PRINTIT(commtime); + std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. 
<< " GB/s "< &directions, - const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) - { - face_table_computed=0; - _npoints = npoints; - _grid = grid; - _directions = directions; - _distances = distances; - _unified_buffer_size=0; - - int osites = _grid->oSites(); - - _entries.resize(_npoints* osites); - for(int ii=0;ii_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - _permute_type[point]=_grid->PermuteType(dimension); - - _checkerboard = checkerboard; - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - int rotate_dim = _grid->_simd_layout[dimension]>2; - - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - - int sshift[2]; - - // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they - // live in lattice or a comms buffer. - if ( !comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Local(point,dimension,shift,0x3); - } else { - Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Local(point,dimension,shift,0x2);// both with block stride loop iteration - } - } else { // All permute extract done in comms phase prior to Stencil application - // So tables are the same whether comm_dim or splice_dim - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - - if ( sshift[0] == sshift[1] ) { - Comms(point,dimension,shift,0x3); - } else { - Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Comms(point,dimension,shift,0x2);// both with block stride loop iteration - } - } - } - u_send_buf.resize(_unified_buffer_size); - comm_buf.resize(_unified_buffer_size); - - PrecomputeByteOffsets(); - - const int Nsimd = grid->Nsimd(); - u_simd_send_buf.resize(Nsimd); - u_simd_recv_buf.resize(Nsimd); - for(int l=0;l_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int gd = _grid->_gdimensions[dimension]; - int ly = _grid->_simd_layout[dimension]; - - // Map to always positive shift modulo global full dimension. - int shift = (shiftpm+fd)%fd; - - // the permute type - int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - - int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - int sx = (x+sshift)%rd; - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sxNsimd(); - - int fd = _grid->_fdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - int shift = (shiftpm + fd) %fd; - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - - _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and - // send to one or more remote nodes. - - int cb= (cbmask==0x2)? 
Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - - int sx = (x+sshift)%rd; - - int offnode = 0; - if ( simd_layout > 1 ) { - - for(int i=0;i>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { - offnode =1; - } - } - - } else { - int comm_proc = ((x+sshift)/rd)%pd; - offnode = (comm_proc!= 0); - } - - - int wraparound=0; - if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { - wraparound = 1; - } - if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { - wraparound = 1; - } - if (!offnode) { - - int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - - } else { - - int words = buffer_size; - if (cbmask != 0x3) words=words>>1; - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; - _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - - } - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._permute=permute; - _entries[idx]._is_local=1; - _entries[idx]._around_the_world=wrap; - } - o +=_grid->_slice_stride[dimension]; - } - - } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - int idx = point+(lo+o+b)*_npoints; - _entries[idx]._offset =ro+o+b; - _entries[idx]._is_local=1; - _entries[idx]._permute=permute; - _entries[idx]._around_the_world=wrap; - } - - } - o +=_grid->_slice_stride[dimension]; - } - - } - } - // Routine builds up integer table for each site in _offsets, _is_local, _permute - void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) - { - int rd = _grid->_rdimensions[dimension]; - - if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - int idx=point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute=0; - _entries[idx]._around_the_world=wrap; - } - o 
+=_grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - for(int n=0;n<_grid->_slice_nblock[dimension];n++){ - for(int b=0;b<_grid->_slice_block[dimension];b++){ - - int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - int idx = point+(so+o+b)*_npoints; - _entries[idx]._offset =offset+(bo++); - _entries[idx]._is_local=0; - _entries[idx]._permute =0; - _entries[idx]._around_the_world=wrap; - } - } - o +=_grid->_slice_stride[dimension]; - } - } - } - - - - template - void HaloExchange(const Lattice &source,compressor &compress) - { - calls++; - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - this->Communicate(); - CommsMerge(); // spins - } -#if 0 - // Overlapping comms and compute typically slows down compute and is useless - // unless memory bandwidth greatly exceeds network - template - std::thread HaloExchangeBegin(const Lattice &source,compressor &compress) { - Mergers.resize(0); - Packets.resize(0); - HaloGather(source,compress); - return std::thread([&] { this->Communicate(); }); - } - void HaloExchangeComplete(std::thread &thr) - { - CommsMerge(); // spins - jointime-=usecond(); - thr.join(); - jointime+=usecond(); - } -#endif - template - void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) - { - int dimension = _directions[point]; - int displacement = _distances[point]; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - - - // Map to always positive shift modulo global full dimension. - int shift = (displacement+fd)%fd; - - // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); - assert (source.checkerboard== _checkerboard); - - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); - - // Gather phase - int sshift [2]; - if ( comm_dim ) { - sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); - sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); - if ( sshift[0] == sshift[1] ) { - if (splice_dim) { - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x3,compress,face_idx); - nosplicetime+=usecond(); - } - } else { - if(splice_dim){ - splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes - GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration - splicetime+=usecond(); - } else { - nosplicetime-=usecond(); - Gather(source,dimension,shift,0x1,compress,face_idx); - Gather(source,dimension,shift,0x2,compress,face_idx); - nosplicetime+=usecond(); - } - } - } - } - - template - void HaloGather(const Lattice &source,compressor &compress) - { - // conformable(source._grid,_grid); - assert(source._grid==_grid); - halogtime-=usecond(); - - assert (comm_buf.size() == _unified_buffer_size ); - u_comm_offset=0; - - // Gather all comms buffers - int face_idx=0; - for(int point = 0 ; point < _npoints; point++) { - compress.Point(point); - HaloGatherDir(source,compress,point,face_idx); - } - face_table_computed=1; - - 
assert(u_comm_offset==_unified_buffer_size); - halogtime+=usecond(); - } - - template - void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) - { - typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; - - GridBase *grid=_grid; - assert(rhs._grid==_grid); - // conformable(_grid,rhs._grid); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(cobj); - - gathertime-=usecond(); - int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - t_table-=usecond(); - face_table.resize(face_idx+1); - Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]); - t_table+=usecond(); - } - t_data-=usecond(); - Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so); - face_idx++; - t_data+=usecond(); - gathertime+=usecond(); - - // Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); - - int rank = _grid->_processor; - int recv_from_rank; - int xmit_to_rank; - _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - assert (xmit_to_rank != _grid->ThisRank()); - assert (recv_from_rank != _grid->ThisRank()); - - // FIXME Implement asynchronous send & also avoid buffer copy - AddPacket((void *)&u_send_buf[u_comm_offset], - (void *) &comm_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); - - u_comm_offset+=words; - } - } - } - - - template - void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) - { - const int Nsimd = _grid->Nsimd(); - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int pd = _grid->_processors[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - // This will not work with a rotate dim - assert(simd_layout==2); - assert(shift>=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; - int words = sizeof(cobj)/sizeof(vector_type); - - assert(cbmask==0x3); // Fixme think there is a latent bug if not true - - int bytes = buffer_size*sizeof(scalar_object); - - std::vector rpointers(Nsimd); - std::vector spointers(Nsimd); - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - - int cb = (cbmask==0x2)? 
Odd : Even; - int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x= rd ); - - if ( any_offnode ) { - - for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); - gathermtime+=usecond(); - - for(int i=0;i2 - // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - int nbr_lcoor= (nbr_coor%ld); - int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer - int nbr_ox = (nbr_lcoor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; - - void *vrp = (void *)rp; - void *vsp = (void *)sp; - - - if(nbr_proc){ - - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); - - rpointers[i] = rp; - - } else { - - rpointers[i] = sp; - - } - } - - AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); - - u_comm_offset +=buffer_size; - } - } - } - - }; - } + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances) + : _permute_type(npoints), _comm_buf_size(npoints) + { + face_table_computed=0; + _npoints = npoints; + _grid = grid; + _directions = directions; + _distances = distances; + _unified_buffer_size=0; + + int osites = _grid->oSites(); + + _entries.resize(_npoints* osites); + for(int ii=0;ii_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + _permute_type[point]=_grid->PermuteType(dimension); + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + int rotate_dim = _grid->_simd_layout[dimension]>2; + + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported + + int sshift[2]; + + // Underlying approach. For each local site build + // up a table containing the npoint "neighbours" and whether they + // live in lattice or a comms buffer. 
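// ---------------------------------------------------------------------------
// The classification below boils down to a processor-offset test. A sketch of
// the non-SIMD branch used in Comms() further down: a plane at outer
// coordinate x, shifted by sshift in a dimension with reduced local extent rd
// and pd ranks, is off node iff the shift crosses a rank boundary.
// ---------------------------------------------------------------------------
inline bool neighbour_offnode(int x, int sshift, int rd, int pd) {
  int comm_proc = ((x + sshift) / rd) % pd; // relative shift in processors
  return comm_proc != 0;                    // nonzero => lives in a comms buffer
}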
+ if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Local(point,dimension,shift,0x3); + } else { + Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Local(point,dimension,shift,0x2);// both with block stride loop iteration + } + } else { // All permute extract done in comms phase prior to Stencil application + // So tables are the same whether comm_dim or splice_dim + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + + if ( sshift[0] == sshift[1] ) { + Comms(point,dimension,shift,0x3); + } else { + Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Comms(point,dimension,shift,0x2);// both with block stride loop iteration + } + } + } + + ///////////////////////////////////////////////////////////////////////////////// + // Try to allocate for receiving in a shared memory region, fall back to buffer + ///////////////////////////////////////////////////////////////////////////////// + const int Nsimd = grid->Nsimd(); + + _grid->ShmBufferFreeAll(); + + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + + u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + } + + PrecomputeByteOffsets(); + } + + void Local (int point, int dimension,int shiftpm,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + int ly = _grid->_simd_layout[dimension]; + + // Map to always positive shift modulo global full dimension. + int shift = (shiftpm+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + + int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sxNsimd(); + + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + int shift = (shiftpm + fd) %fd; + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored + + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); + + for(int x=0;xPermuteType(dimension); + + int sx = (x+sshift)%rd; + + int offnode = 0; + if ( simd_layout > 1 ) { + + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + if ( nbr_proc ) { + offnode =1; + } + } + + } else { + int comm_proc = ((x+sshift)/rd)%pd; + offnode = (comm_proc!= 0); + } + + + int wraparound=0; + if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { + wraparound = 1; + } + if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { + wraparound = 1; + } + if (!offnode) { + + int permute_slice=0; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + int words = buffer_size; + if (cbmask != 0x3) words=words>>1; + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + int unified_buffer_offset = _unified_buffer_size; + _unified_buffer_size += words; + + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase + + } + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int o = 0; // relative offset to base within plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._permute=permute; + _entries[idx]._is_local=1; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { + int idx = point+(lo+o+b)*_npoints; + _entries[idx]._offset =ro+o+b; + _entries[idx]._is_local=1; + _entries[idx]._permute=permute; + _entries[idx]._around_the_world=wrap; + } + + } + o +=_grid->_slice_stride[dimension]; + } + + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + int idx=point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute=0; + _entries[idx]._around_the_world=wrap; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int 
b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + int idx = point+(so+o+b)*_npoints; + _entries[idx]._offset =offset+(bo++); + _entries[idx]._is_local=0; + _entries[idx]._permute =0; + _entries[idx]._around_the_world=wrap; + } + } + o +=_grid->_slice_stride[dimension]; + } + } + } + + template void HaloExchange(const Lattice &source,compressor &compress) + { + std::vector > reqs; + calls++; + Mergers.resize(0); + Packets.resize(0); + _grid->StencilBarrier(); + HaloGather(source,compress); + this->CommunicateBegin(reqs); + _grid->StencilBarrier(); + this->CommunicateComplete(reqs); + _grid->StencilBarrier(); + CommsMerge(); // spins + } + + template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) + { + int dimension = _directions[point]; + int displacement = _distances[point]; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + // int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (source.checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); + sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x3,compress,face_idx); + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x3,compress,face_idx); + nosplicetime+=usecond(); + } + } else { + if(splice_dim){ + splicetime-=usecond(); + GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes + GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration + splicetime+=usecond(); + } else { + nosplicetime-=usecond(); + Gather(source,dimension,shift,0x1,compress,face_idx); + Gather(source,dimension,shift,0x2,compress,face_idx); + nosplicetime+=usecond(); + } + } + } + } + + template + void HaloGather(const Lattice &source,compressor &compress) + { + // conformable(source._grid,_grid); + assert(source._grid==_grid); + halogtime-=usecond(); + + u_comm_offset=0; + + // Gather all comms buffers + int face_idx=0; + for(int point = 0 ; point < _npoints; point++) { + compress.Point(point); + HaloGatherDir(source,compress,point,face_idx); + } + face_table_computed=1; + + assert(u_comm_offset==_unified_buffer_size); + halogtime+=usecond(); + } + + template + void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + { + typedef typename cobj::vector_type vector_type; + typedef typename cobj::scalar_type scalar_type; + + GridBase *grid=_grid; + assert(rhs._grid==_grid); + // conformable(_grid,rhs._grid); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + 
assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(cobj); + + gathertime-=usecond(); + int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane + if ( !face_table_computed ) { + t_table-=usecond(); + face_table.resize(face_idx+1); + Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, + face_table[face_idx]); + t_table+=usecond(); + } + + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + assert (xmit_to_rank != _grid->ThisRank()); + assert (recv_from_rank != _grid->ThisRank()); + + ///////////////////////////////////////////////////////// + // try the direct copy if possible + ///////////////////////////////////////////////////////// + + + cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); + if ( send_buf==NULL ) { + send_buf = u_send_buf_p; + } + // std::cout << " send_bufs "< + void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + { + const int Nsimd = _grid->Nsimd(); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + // This will not work with a rotate dim + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(cobj)/sizeof(vector_type); + + assert(cbmask==0x3); // Fixme think there is a latent bug if not true + + int bytes = buffer_size*sizeof(scalar_object); + + std::vector rpointers(Nsimd); + std::vector spointers(Nsimd); + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x= rd ); + + if ( any_offnode ) { + + for(int i=0;i(rhs,spointers,dimension,sx,cbmask,compress); + gathermtime+=usecond(); + + for(int i=0;i2 + // std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + int nbr_lcoor= (nbr_coor%ld); + int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer + int nbr_ox = (nbr_lcoor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; + auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; + + if(nbr_proc){ + + int recv_from_rank; + int xmit_to_rank; + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); + // if ((ShmDirectCopy==0)||(shm==NULL)) { + if (shm==NULL) { + shm = rp; + } + + // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node + // assuming above pointer flip + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = shm; + + } else { + + rpointers[i] = sp; + + } + } + + AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); + + u_comm_offset +=buffer_size; + } + } + } + +}; +} #endif diff --git a/lib/Threads.h b/lib/Threads.h index d502dd82..08e5d545 100644 --- a/lib/Threads.h +++ b/lib/Threads.h @@ -127,6 +127,22 @@ class GridThread { ThreadBarrier(); }; + static void bcopy(const void *src, void *dst, size_t len) { +#ifdef GRID_OMP +#pragma omp parallel + { + const char *c_src =(char *) src; + char *c_dest=(char *) dst; + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + bcopy(&c_src[myoff],&c_dest[myoff],mywork); + } +#else + bcopy(src,dst,len); +#endif + } + + }; } diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h index 8a24c87b..72b21ee3 100644 --- a/lib/cartesian/Cartesian_base.h +++ b/lib/cartesian/Cartesian_base.h @@ -77,7 +77,7 @@ public: // GridCartesian / GridRedBlackCartesian //////////////////////////////////////////////////////////////// virtual int CheckerBoarded(int dim)=0; - virtual int CheckerBoard(std::vector site)=0; + virtual int CheckerBoard(std::vector &site)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h index 14ab8b55..b0d20441 100644 --- a/lib/cartesian/Cartesian_full.h +++ b/lib/cartesian/Cartesian_full.h @@ -49,7 +49,7 @@ public: virtual int CheckerBoarded(int dim){ return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ return 0; } virtual int CheckerBoardDestination(int cb,int shift,int dim){ diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h index db0508d5..6a4300d7 100644 --- a/lib/cartesian/Cartesian_red_black.h +++ b/lib/cartesian/Cartesian_red_black.h @@ -49,7 +49,7 @@ public: if( dim==_checker_dim) return 1; else return 0; } - virtual int CheckerBoard(std::vector site){ + virtual int CheckerBoard(std::vector &site){ int linear=0; assert(site.size()==_ndimension); for(int d=0;d<_ndimension;d++){ diff --git a/lib/communicator/Communicator_base.cc b/lib/communicator/Communicator_base.cc new file mode 100644 index 00000000..8b789f4b --- /dev/null +++ b/lib/communicator/Communicator_base.cc @@ -0,0 +1,131 @@ + /************************************************************************************* + 
+ Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_none.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include "Grid.h" +namespace Grid { + +/////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmRank; +int CartesianCommunicator::ShmSize; +int CartesianCommunicator::GroupRank; +int CartesianCommunicator::GroupSize; +int CartesianCommunicator::WorldRank; +int CartesianCommunicator::WorldSize; +int CartesianCommunicator::Slave; +void * CartesianCommunicator::ShmCommBuf; + +///////////////////////////////// +// Alloc, free shmem region +///////////////////////////////// +void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){ + // bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes + void *ptr = (void *)heap_top; + heap_top += bytes; + heap_bytes+= bytes; + assert(heap_bytes < MAX_MPI_SHM_BYTES); + return ptr; +} +void CartesianCommunicator::ShmBufferFreeAll(void) { + heap_top =(size_t)ShmBufferSelf(); + heap_bytes=0; +} + +///////////////////////////////// +// Grid information queries +///////////////////////////////// +int CartesianCommunicator::IsBoss(void) { return _processor==0; }; +int CartesianCommunicator::BossRank(void) { return 0; }; +int CartesianCommunicator::ThisRank(void) { return _processor; }; +const std::vector & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; +const std::vector & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; +int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; + +//////////////////////////////////////////////////////////////////////////////// +// very VERY rarely (Log, serial RNG) we need world without a grid +//////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void){ return WorldRank; }; +int CartesianCommunicator::Ranks (void) { return WorldSize; }; +int CartesianCommunicator::Nodes (void) { return GroupSize; }; +int CartesianCommunicator::Cores (void) { return ShmSize; }; +int CartesianCommunicator::NodeRank (void) { return GroupRank; }; +int CartesianCommunicator::CoreRank (void) { return ShmRank; }; + +void CartesianCommunicator::GlobalSum(ComplexF &c) +{ + GlobalSumVector((float *)&c,2); +} +void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) +{ + GlobalSumVector((float *)c,2*N); +} +void CartesianCommunicator::GlobalSum(ComplexD &c) +{ + GlobalSumVector((double *)&c,2); +} +void 
CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
+{
+  GlobalSumVector((double *)c,2*N);
+}
+
+#ifndef GRID_COMMS_MPI3
+
+void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                       void *xmit,
+                                                       int xmit_to_rank,
+                                                       void *recv,
+                                                       int recv_from_rank,
+                                                       int bytes)
+{
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+{
+  SendToRecvFromComplete(waitall);
+}
+void CartesianCommunicator::StencilBarrier(void){};
+
+commVector<unsigned char> CartesianCommunicator::ShmBufStorageVector;
+
+void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
+
+void *CartesianCommunicator::ShmBuffer(int rank) {
+  return NULL;
+}
+void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
+  return NULL;
+}
+void CartesianCommunicator::ShmInitGeneric(void){
+  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
+  ShmCommBuf=(void *)&ShmBufStorageVector[0];
+}
+
+#endif
+
+}
+
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 9b5ae8cb..9a15d2b0 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -40,143 +40,188 @@ Author: Peter Boyle
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
+
 namespace Grid {
+
 class CartesianCommunicator {
   public:
+
+  // 65536 ranks per node adequate for now
+  // 128MB shared memory for comms enough for 48^4 local vol comms
+  // Give external control (command line override?) of this
+
+  static const int      MAXLOG2RANKSPERNODE = 16;
+  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024;
+  // Communicator should know nothing of the physics grid, only processor grid.
+  int              _Nprocessors;     // How many in all
+  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+  int              _processor;       // linear processor rank
+  std::vector<int> _processor_coor;  // linear processor coordinate
+  unsigned long    _ndimension;
-  int _Nprocessors;                  // How many in all
-  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
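A minimal usage sketch of the bump-style shared-memory allocator implemented in Communicator_base.cc above; Comm is a hypothetical CartesianCommunicator instance, not part of this patch:

    void *a = Comm.ShmBufferMalloc(1024);   // heap_top advances by 1024 bytes
    void *b = Comm.ShmBufferMalloc(4096);   // placed immediately after a in the shm region
    Comm.ShmBufferFreeAll();                // resets heap_top/heap_bytes; no per-pointer free

The allocator only ever appends (asserting if heap_bytes passes MAX_MPI_SHM_BYTES), so each communication phase pairs its ShmBufferMalloc calls with a single ShmBufferFreeAll.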
- int _processor; // linear processor rank - std::vector _processor_coor; // linear processor coordinate - unsigned long _ndimension; - -#ifdef GRID_COMMS_MPI - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; -#elif GRID_COMMS_MPI3 - MPI_Comm communicator; - typedef MPI_Request CommsRequest_t; - - const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now - - std::vector WorldDims; - std::vector GroupDims; - std::vector ShmDims; - - std::vector GroupCoor; - std::vector ShmCoor; - std::vector WorldCoor; - - int GroupRank; - int ShmRank; - int WorldRank; - - int GroupSize; - int ShmSize; - int WorldSize; - - std::vector LexicographicToWorldRank; +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) + MPI_Comm communicator; + static MPI_Comm communicator_world; + typedef MPI_Request CommsRequest_t; #else - typedef int CommsRequest_t; + typedef int CommsRequest_t; #endif - static void Init(int *argc, char ***argv); + //////////////////////////////////////////////////////////////////// + // Helper functionality for SHM Windows common to all other impls + //////////////////////////////////////////////////////////////////// + // Longer term; drop this in favour of a master / slave model with + // cartesian communicator on a subset of ranks, slave ranks controlled + // by group leader with data xfer via shared memory + //////////////////////////////////////////////////////////////////// +#ifdef GRID_COMMS_MPI3 + std::vector WorldDims; + std::vector GroupDims; + std::vector ShmDims; + + std::vector GroupCoor; + std::vector ShmCoor; + std::vector WorldCoor; + + static std::vector GroupRanks; + static std::vector MyGroup; + static int ShmSetup; + static MPI_Win ShmWindow; + static MPI_Comm ShmComm; + + std::vector LexicographicToWorldRank; + + static std::vector ShmCommBufs; +#else + static void ShmInitGeneric(void); + static commVector ShmBufStorageVector; +#endif + static void * ShmCommBuf; + size_t heap_top; + size_t heap_bytes; + void *ShmBufferSelf(void); + void *ShmBuffer(int rank); + void *ShmBufferTranslate(int rank,void * local_p); + void *ShmBufferMalloc(size_t bytes); + void ShmBufferFreeAll(void) ; + + //////////////////////////////////////////////// + // Must call in Grid startup + //////////////////////////////////////////////// + static void Init(int *argc, char ***argv); + + //////////////////////////////////////////////// + // Constructor of any given grid + //////////////////////////////////////////////// + CartesianCommunicator(const std::vector &pdimensions_in); + + //////////////////////////////////////////////////////////////////////////////////////// + // Wraps MPI_Cart routines, or implements equivalent on other impls + //////////////////////////////////////////////////////////////////////////////////////// + void ShiftedRanks(int dim,int shift,int & source, int & dest); + int RankFromProcessorCoor(std::vector &coor); + void ProcessorCoorFromRank(int rank,std::vector &coor); + + ///////////////////////////////// + // Grid information and queries + ///////////////////////////////// + static int ShmRank; + static int ShmSize; + static int GroupSize; + static int GroupRank; + static int WorldRank; + static int WorldSize; + static int Slave; + + int IsBoss(void) ; + int BossRank(void) ; + int ThisRank(void) ; + const std::vector & ThisProcessorCoor(void) ; + const std::vector & ProcessorGrid(void) ; + int ProcessorCount(void) ; + static int Ranks (void); + static int Nodes (void); + static int Cores (void); + static int NodeRank (void); + static 
int CoreRank (void); - // Constructor - CartesianCommunicator(const std::vector &pdimensions_in); + //////////////////////////////////////////////////////////////////////////////// + // very VERY rarely (Log, serial RNG) we need world without a grid + //////////////////////////////////////////////////////////////////////////////// + static int RankWorld(void) ; + static void BroadcastWorld(int root,void* data, int bytes); + + //////////////////////////////////////////////////////////// + // Reduction + //////////////////////////////////////////////////////////// + void GlobalSum(RealF &); + void GlobalSumVector(RealF *,int N); + void GlobalSum(RealD &); + void GlobalSumVector(RealD *,int N); + void GlobalSum(uint32_t &); + void GlobalSum(uint64_t &); + void GlobalSum(ComplexF &c); + void GlobalSumVector(ComplexF *c,int N); + void GlobalSum(ComplexD &c); + void GlobalSumVector(ComplexD *c,int N); + + template void GlobalSum(obj &o){ + typedef typename obj::scalar_type scalar_type; + int words = sizeof(obj)/sizeof(scalar_type); + scalar_type * ptr = (scalar_type *)& o; + GlobalSumVector(ptr,words); + } + + //////////////////////////////////////////////////////////// + // Face exchange, buffer swap in translational invariant way + //////////////////////////////////////////////////////////// + void SendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendRecvPacket(void *xmit, + void *recv, + int xmit_to_rank, + int recv_from_rank, + int bytes); + + void SendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void SendToRecvFromComplete(std::vector &waitall); - // Wraps MPI_Cart routines - void ShiftedRanks(int dim,int shift,int & source, int & dest); - int RankFromProcessorCoor(std::vector &coor); - void ProcessorCoorFromRank(int rank,std::vector &coor); + void StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes); + + void StencilSendToRecvFromComplete(std::vector &waitall); + void StencilBarrier(void); - ///////////////////////////////// - // Grid information queries - ///////////////////////////////// - int IsBoss(void) { return _processor==0; }; - int BossRank(void) { return 0; }; - int ThisRank(void) { return _processor; }; - const std::vector & ThisProcessorCoor(void) { return _processor_coor; }; - const std::vector & ProcessorGrid(void) { return _processors; }; - int ProcessorCount(void) { return _Nprocessors; }; - - //////////////////////////////////////////////////////////// - // Reduction - //////////////////////////////////////////////////////////// - void GlobalSum(RealF &); - void GlobalSumVector(RealF *,int N); - - void GlobalSum(RealD &); - void GlobalSumVector(RealD *,int N); - - void GlobalSum(uint32_t &); - void GlobalSum(uint64_t &); - - void GlobalSum(ComplexF &c) - { - GlobalSumVector((float *)&c,2); - } - void GlobalSumVector(ComplexF *c,int N) - { - GlobalSumVector((float *)c,2*N); - } - - void GlobalSum(ComplexD &c) - { - GlobalSumVector((double *)&c,2); - } - void GlobalSumVector(ComplexD *c,int N) - { - GlobalSumVector((double *)c,2*N); - } - - template void GlobalSum(obj &o){ - typedef typename obj::scalar_type scalar_type; - int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; - GlobalSumVector(ptr,words); - } - //////////////////////////////////////////////////////////// - // Face exchange, buffer swap in translational invariant way - 
//////////////////////////////////////////////////////////// - void SendToRecvFrom(void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - - void SendRecvPacket(void *xmit, - void *recv, - int xmit_to_rank, - int recv_from_rank, - int bytes); - - void SendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void SendToRecvFromComplete(std::vector &waitall); - - //////////////////////////////////////////////////////////// - // Barrier - //////////////////////////////////////////////////////////// - void Barrier(void); - - //////////////////////////////////////////////////////////// - // Broadcast a buffer and composite larger - //////////////////////////////////////////////////////////// - void Broadcast(int root,void* data, int bytes); - template void Broadcast(int root,obj &data) + //////////////////////////////////////////////////////////// + // Barrier + //////////////////////////////////////////////////////////// + void Barrier(void); + + //////////////////////////////////////////////////////////// + // Broadcast a buffer and composite larger + //////////////////////////////////////////////////////////// + void Broadcast(int root,void* data, int bytes); + + template void Broadcast(int root,obj &data) { Broadcast(root,(void *)&data,sizeof(data)); }; - static void BroadcastWorld(int root,void* data, int bytes); - }; } diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index dff9811a..a638eebb 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -30,21 +30,30 @@ Author: Peter Boyle namespace Grid { - // Should error check all MPI calls. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +MPI_Comm CartesianCommunicator::communicator_world; + +// Should error check all MPI calls. 
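A sketch of the checking that comment calls for, assuming plain MPI and the assert convention already used in this file; the macro is hypothetical, not part of the patch:

    #define MPI_CHECK(call) do { int _e = (call); assert(_e == MPI_SUCCESS); } while (0)
    // e.g. MPI_CHECK(MPI_Comm_dup(MPI_COMM_WORLD,&communicator_world));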
void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; + ShmInitGeneric(); } - int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; - } - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); @@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) _processors = processors; _processor_coor.resize(_ndimension); - MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Comm_rank(communicator,&_processor); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); @@ -67,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors) assert(Size==_Nprocessors); } - void CartesianCommunicator::GlobalSum(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -168,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } @@ -187,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } - + /////////////////////////////////////////////////////// + // Should only be used prior to Grid Init finished. + // Check for this? + /////////////////////////////////////////////////////// void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { int ierr= MPI_Bcast(data, bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 11734fd9..14a152ab 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -30,25 +30,199 @@ Author: Peter Boyle namespace Grid { -// Global used by Init and nowhere else. How to hide? -int Rank(void) { - int pe; - MPI_Comm_rank(MPI_COMM_WORLD,&pe); - return pe; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +int CartesianCommunicator::ShmSetup = 0; + +MPI_Comm CartesianCommunicator::communicator_world; +MPI_Comm CartesianCommunicator::ShmComm; +MPI_Win CartesianCommunicator::ShmWindow; + +std::vector CartesianCommunicator::GroupRanks; +std::vector CartesianCommunicator::MyGroup; +std::vector CartesianCommunicator::ShmCommBufs; + +void *CartesianCommunicator::ShmBufferSelf(void) +{ + return ShmCommBufs[ShmRank]; } - // Should error check all MPI calls. 
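The ShmBuffer/ShmBufferTranslate helpers defined next return a peer rank's view of the shared window; translation works because every rank's segment belongs to the same MPI window, so a byte offset computed against our own base applies unchanged to the peer's base. A standalone sketch of the arithmetic (variable names hypothetical):

    uint64_t off     = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; // offset into my segment
    void    *on_peer = (void *)((uint64_t)ShmCommBufs[gpeer] + off);       // same offset in the peer's segment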
+void *CartesianCommunicator::ShmBuffer(int rank) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + return ShmCommBufs[gpeer]; + } +} +void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) +{ + int gpeer = GroupRanks[rank]; + if (gpeer == MPI_UNDEFINED){ + return NULL; + } else { + uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; + uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; + return (void *) remote; + } +} + void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { MPI_Init(argc,argv); } -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Want to implement some magic ... Group sub-cubes into those on same node - // - //////////////////////////////////////////////////////////////////////////////////////////////////////////// + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + MPI_Comm_rank(communicator_world,&WorldRank); + MPI_Comm_size(communicator_world,&WorldSize); + + ///////////////////////////////////////////////////////////////////// + // Split into groups that can share memory + ///////////////////////////////////////////////////////////////////// + MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); + MPI_Comm_rank(ShmComm ,&ShmRank); + MPI_Comm_size(ShmComm ,&ShmSize); + GroupSize = WorldSize/ShmSize; + + ///////////////////////////////////////////////////////////////////// + // find world ranks in our SHM group (i.e. which ranks are on our node) + ///////////////////////////////////////////////////////////////////// + MPI_Group WorldGroup, ShmGroup; + MPI_Comm_group (communicator_world, &WorldGroup); + MPI_Comm_group (ShmComm, &ShmGroup); + + std::vector world_ranks(WorldSize); + GroupRanks.resize(WorldSize); + MyGroup.resize(ShmSize); + for(int r=0;r()); + int myleader = MyGroup[0]; + + std::vector leaders_1hot(WorldSize,0); + std::vector leaders_group(GroupSize,0); + leaders_1hot [ myleader ] = 1; + + /////////////////////////////////////////////////////////////////// + // global sum leaders over comm world + /////////////////////////////////////////////////////////////////// + int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); + assert(ierr==0); + + /////////////////////////////////////////////////////////////////// + // find the group leaders world rank + /////////////////////////////////////////////////////////////////// + int group=0; + for(int l=0;l + for(uint64_t page=0;page coor = _processor_coor; @@ -78,27 +252,11 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { + int ierr; + + communicator=communicator_world; + _ndimension = processors.size(); - std::cout << "Creating "<< _ndimension << " dim communicator "< &processors) //////////////////////////////////////////////////////////////// int dim = 0; + std::vector WorldDims = processors; + ShmDims.resize(_ndimension,1); GroupDims.resize(_ndimension); - + ShmCoor.resize(_ndimension); GroupCoor.resize(_ndimension); WorldCoor.resize(_ndimension); + for(int l2=0;l2 &processors) for(int d=0;d<_ndimension;d++){ GroupDims[d] = WorldDims[d]/ShmDims[d]; } - std::cout << "Group dims "< world_ranks(WorldSize); - std::vector group_ranks(WorldSize); - std::vector mygroup(GroupSize); - 
for(int r=0;r &processors) _processors = processors; _processor_coor.resize(_ndimension); for(int i=0;i<_ndimension;i++){ - std::cout << " p " << _processors[i]<()); - int myleader = mygroup[0]; - - std::vector leaders_1hot(WorldSize,0); - std::vector leaders_group(GroupSize,0); - leaders_1hot [ myleader ] = 1; - - /////////////////////////////////////////////////////////////////// - // global sum leaders over comm world - /////////////////////////////////////////////////////////////////// - int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator); - assert(ierr==0); - - /////////////////////////////////////////////////////////////////// - // find the group leaders world rank - /////////////////////////////////////////////////////////////////// - int group=0; - for(int l=0;l &lis int from, int bytes) { +#if 0 + this->StencilBarrier(); + + MPI_Request xrq; + MPI_Request rrq; + + static int sequence; + + int ierr; + int tag; + int check; + + assert(dest != _processor); + assert(from != _processor); + + int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; + int gme = GroupRanks[_processor]; + + sequence++; + + char *from_ptr = (char *)ShmCommBufs[ShmRank]; + + int small = (bytesStencilBarrier(); + + if (small && (gfrom !=MPI_UNDEFINED) ) { + T *ip = (T *)from_ptr; + T *op = (T *)recv; +PARALLEL_FOR_LOOP + for(int w=0;wStencilBarrier(); + +#else MPI_Request xrq; MPI_Request rrq; int rank = _processor; @@ -318,13 +485,62 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis list.push_back(xrq); list.push_back(rrq); +#endif } + +void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + + assert(dest != _processor); + assert(from != _processor); + + int gdest = GroupRanks[dest]; + int gfrom = GroupRanks[from]; + int gme = GroupRanks[_processor]; + + assert(gme == ShmRank); + + if ( gdest == MPI_UNDEFINED ) { + ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + assert(ierr==0); + list.push_back(xrq); + } + + if ( gfrom ==MPI_UNDEFINED) { + ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + assert(ierr==0); + list.push_back(rrq); + } + +} + + +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list) +{ + SendToRecvFromComplete(list); +} + +void CartesianCommunicator::StencilBarrier(void) +{ + MPI_Win_sync (ShmWindow); + MPI_Barrier (ShmComm); + MPI_Win_sync (ShmWindow); +} + void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { int nreq=list.size(); std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); - assert(ierr==0); } @@ -350,7 +566,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) bytes, MPI_BYTE, root, - MPI_COMM_WORLD); + communicator_world); assert(ierr==0); } diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 8601255a..198c1add 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -28,12 +28,22 @@ Author: Peter Boyle #include "Grid.h" namespace Grid { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// + void CartesianCommunicator::Init(int *argc, char *** arv) { + 
WorldRank = 0; + WorldSize = 1; + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; + ShmInitGeneric(); } -int Rank(void ){ return 0; }; - CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _processors = processors; @@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(0); } -void CartesianCommunicator::Barrier(void) -{ -} - -void CartesianCommunicator::Broadcast(int root,void* data, int bytes) -{ -} -void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) -{ -} - - +void CartesianCommunicator::Barrier(void){} +void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) { return 0;} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ assert(0);} void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { source =0; dest=0; } -int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) -{ - return 0; -} -void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor) -{ -} } diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index 091e266e..d5f3ed76 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -39,17 +39,22 @@ namespace Grid { BACKTRACEFILE(); \ }\ } -int Rank(void) { - return shmem_my_pe(); -} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// + typedef struct HandShake_t { uint64_t seq_local; uint64_t seq_remote; } HandShake; + static Vector< HandShake > XConnections; static Vector< HandShake > RConnections; + void CartesianCommunicator::Init(int *argc, char ***argv) { shmem_init(); XConnections.resize(shmem_n_pes()); @@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { RConnections[pe].seq_local = 0; RConnections[pe].seq_remote= 0; } + WorldSize = shmem_n_pes(); + WorldRank = shmem_my_pe(); + ShmRank=0; + ShmSize=1; + GroupRank=WorldRank; + GroupSize=WorldSize; + Slave =0; shmem_barrier_all(); + ShmInitGeneric(); } + CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { _ndimension = processors.size(); @@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, if ( _processor == sender ) { - printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); // Check he has posted a receive while(SendSeq->seq_remote == SendSeq->seq_local); - printf("Sender receive %d posted\n",sender,receiver); - // Advance our send count seq = ++(SendSeq->seq_local); @@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, shmem_putmem(recv,xmit,bytes,receiver); shmem_fence(); - printf("Sender sent payload %d\n",seq); //Notify him we're done shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_fence(); - printf("Sender ringing door bell %d\n",seq); } if ( _processor == receiver ) { - printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); // Post a receive seq = ++(RecvSeq->seq_local); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); - printf("Receiver Opening letter box %d\n",seq); - - // Now wait until he has advanced our 
reception counter while(RecvSeq->seq_remote != RecvSeq->seq_local); - printf("Receiver Got the mail %d\n",seq); } } diff --git a/lib/lattice/Lattice_peekpoke.h b/lib/lattice/Lattice_peekpoke.h index 9bece943..19d349c4 100644 --- a/lib/lattice/Lattice_peekpoke.h +++ b/lib/lattice/Lattice_peekpoke.h @@ -154,7 +154,7 @@ PARALLEL_FOR_LOOP template void peekLocalSite(sobj &s,const Lattice &l,std::vector &site){ - GridBase *grid=l._grid; + GridBase *grid = l._grid; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -164,16 +164,18 @@ PARALLEL_FOR_LOOP assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; + + for(int w=0;wCheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); + static const int words=sizeof(vobj)/sizeof(vector_type); int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - std::vector buf(Nsimd); - - // extract-modify-merge cycle is easiest way and this is not perf critical - extract(l._odata[odx],buf); + scalar_type * vp = (scalar_type *)&l._odata[odx]; + scalar_type * pt = (scalar_type *)&s; - buf[idx] = s; - - merge(l._odata[odx],buf); + for(int w=0;wGlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); - + int l_idx=generator_idx(o_idx,i_idx); - - std::vector site_seeds(4); - for(int i=0;i<4;i++){ + + const int num_rand_seed=16; + std::vector site_seeds(num_rand_seed); + for(int i=0;i - // class MyOp : public { - // public: - // - // INHERIT_ALL_IMPL_TYPES(Impl); - // - // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) - // { - // - // }; - // - // } - ////////////////////////////////////////////// - - - //////////////////////////////////////////////////////////////////////// - // Implementation dependent fermion types - //////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////// + // Template parameter class constructs to package + // externally control Fermion implementations + // in orthogonal directions + // + // Ultimately need Impl to always define types where XXX is opaque + // + // typedef typename XXX Simd; + // typedef typename XXX GaugeLinkField; + // typedef typename XXX GaugeField; + // typedef typename XXX GaugeActField; + // typedef typename XXX FermionField; + // typedef typename XXX DoubledGaugeField; + // typedef typename XXX SiteSpinor; + // typedef typename XXX SiteHalfSpinor; + // typedef typename XXX Compressor; + // + // and Methods: + // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St) + // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) + // + // + // To acquire the typedefs from "Base" (either a base class or template param) use: + // + // INHERIT_GIMPL_TYPES(Base) + // INHERIT_FIMPL_TYPES(Base) + // INHERIT_IMPL_TYPES(Base) + // + // The Fermion operators will do 
the following: + // + // struct MyOpParams { + // RealD mass; + // }; + // + // + // template + // class MyOp : public { + // public: + // + // INHERIT_ALL_IMPL_TYPES(Impl); + // + // MyOp(MyOpParams Myparm, ImplParams &ImplParam) : Impl(ImplParam) + // { + // + // }; + // + // } + ////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////// + // Implementation dependent fermion types + //////////////////////////////////////////////////////////////////////// + #define INHERIT_FIMPL_TYPES(Impl)\ - typedef typename Impl::FermionField FermionField; \ - typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ - typedef typename Impl::SiteSpinor SiteSpinor; \ - typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ - typedef typename Impl::Compressor Compressor; \ - typedef typename Impl::StencilImpl StencilImpl; \ - typedef typename Impl::ImplParams ImplParams; \ - typedef typename Impl::Coeff_t Coeff_t; - + typedef typename Impl::FermionField FermionField; \ + typedef typename Impl::DoubledGaugeField DoubledGaugeField; \ + typedef typename Impl::SiteSpinor SiteSpinor; \ + typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ + typedef typename Impl::Compressor Compressor; \ + typedef typename Impl::StencilImpl StencilImpl; \ + typedef typename Impl::ImplParams ImplParams; \ + typedef typename Impl::Coeff_t Coeff_t; + #define INHERIT_IMPL_TYPES(Base) \ - INHERIT_GIMPL_TYPES(Base) \ - INHERIT_FIMPL_TYPES(Base) + INHERIT_GIMPL_TYPES(Base) \ + INHERIT_FIMPL_TYPES(Base) + + ///////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index + ///////////////////////////////////////////////////////////////////////////// + template + class WilsonImpl : public PeriodicGaugeImpl > { + + public: + + static const int Dimension = Representation::Dimension; + typedef PeriodicGaugeImpl > Gimpl; + + //Necessary? + constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} - /////// - // Single flavour four spinors with colour index - /////// - template - class WilsonImpl - : public PeriodicGaugeImpl > { - public: - static const int Dimension = Representation::Dimension; - typedef PeriodicGaugeImpl > Gimpl; - - //Necessary? - constexpr bool is_fundamental() const{return Dimension == Nc ? 
1 : 0;} + const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; - const bool LsVectorised=false; - typedef _Coeff_t Coeff_t; - - - INHERIT_GIMPL_TYPES(Gimpl); + INHERIT_GIMPL_TYPES(Gimpl); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; + inline void multLink(SiteHalfSpinor &phi, + const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, + int mu, + StencilEntry *SE, + StencilImpl &St) { + mult(&phi(), &U(mu), &chi()); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; + template + inline void loadLinkElement(Simd ®, ref &memory) { + reg = memory; + } - ImplParams Params; - - WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - inline void multLink(SiteHalfSpinor &phi, - const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, - int mu, - StencilEntry *SE, - StencilImpl &St) { - mult(&phi(), &U(mu), &chi()); + inline void DoubleStore(GridBase *GaugeGrid, + DoubledGaugeField &Uds, + const GaugeField &Umu) { + conformable(Uds._grid, GaugeGrid); + conformable(Umu._grid, GaugeGrid); + GaugeLinkField U(GaugeGrid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); } + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + GaugeLinkField link(mat._grid); + link = TraceIndex(outerProduct(Btilde,A)); + PokeIndex(mat,link,mu); + } - template - inline void loadLinkElement(Simd ®, - ref &memory) { - reg = memory; - } + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - inline void DoubleStore(GridBase *GaugeGrid, - DoubledGaugeField &Uds, - const GaugeField &Umu) { - conformable(Uds._grid, GaugeGrid); - conformable(Umu._grid, GaugeGrid); - GaugeLinkField U(GaugeGrid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - PokeIndex(Uds, U, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uds, U, mu + 4); + int Ls=Btilde._grid->_fdimensions[0]; + GaugeLinkField tmp(mat._grid); + tmp = zero; + + PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + int sU=sss; + for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ - GaugeLinkField link(mat._grid); - link = TraceIndex(outerProduct(Btilde,A)); - PokeIndex(mat,link,mu); - } + PokeIndex(mat,tmp,mu); - inline void InsertForce5D(GaugeField &mat, FermionField 
&Btilde, FermionField Ã,int mu){ - - int Ls=Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; + } + }; - PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - int sU=sss; - for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here - } - } - PokeIndex(mat,tmp,mu); + //////////////////////////////////////////////////////////////////////////////////// + // Single flavour four spinors with colour index, 5d redblack + //////////////////////////////////////////////////////////////////////////////////// +template +class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { + public: + + static const int Dimension = Nrepresentation; + const bool LsVectorised=true; + typedef _Coeff_t Coeff_t; + typedef PeriodicGaugeImpl > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplSpinor = iScalar, Ns> >; + template using iImplHalfSpinor = iScalar, Nhs> >; + template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplGaugeField = iVector >, Nd>; + template using iImplGaugeLink = iScalar > >; + + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef Lattice FermionField; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonImplParams ImplParams; + typedef WilsonStencil StencilImpl; + + ImplParams Params; + + DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; + + bool overlapCommsCompute(void) { return false; }; + + template + inline void loadLinkElement(Simd ®, ref &memory) { + vsplat(reg, memory); + } + + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { + SiteGaugeLink UU; + for (int i = 0; i < Nrepresentation; i++) { + for (int j = 0; j < Nrepresentation; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); } - }; - - /////// - // Single flavour four spinors with colour index, 5d redblack - /////// - template - class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { - public: + } + mult(&phi(), &UU(), &chi()); + } - static const int Dimension = Nrepresentation; - const bool LsVectorised=true; - typedef _Coeff_t Coeff_t; - typedef PeriodicGaugeImpl > Gimpl; - - INHERIT_GIMPL_TYPES(Gimpl); + inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) + { + SiteScalarGaugeField ScalarUmu; + SiteDoubledGaugeField ScalarUds; + + GaugeLinkField U(Umu._grid); + GaugeField Uadj(Umu._grid); + for (int mu = 0; mu < Nd; mu++) { + U = PeekIndex(Umu, mu); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uadj, U, mu); + } + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - template using iImplSpinor = iScalar, Ns> >; - template using iImplHalfSpinor = iScalar, Nhs> >; - template using iImplDoubledGaugeField = iVector >, Nds>; - template using iImplGaugeField = iVector >, Nd>; - template using iImplGaugeLink = iScalar > >; + peekLocalSite(ScalarUmu, Umu, lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef Lattice FermionField; + peekLocalSite(ScalarUmu, Uadj, 
lcoor); + for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - // Make the doubled gauge field a *scalar* - typedef iImplDoubledGaugeField - SiteDoubledGaugeField; // This is a scalar - typedef iImplGaugeField - SiteScalarGaugeField; // scalar - typedef iImplGaugeLink - SiteScalarGaugeLink; // scalar + pokeLocalSite(ScalarUds, Uds, lcoor); + } + } - typedef Lattice DoubledGaugeField; + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) + { + assert(0); + } - typedef WilsonCompressor Compressor; - typedef WilsonImplParams ImplParams; - typedef WilsonStencil StencilImpl; - - ImplParams Params; - - DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; - - bool overlapCommsCompute(void) { return false; }; - - template - inline void loadLinkElement(Simd ®, ref &memory) { - vsplat(reg, memory); - } - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - SiteGaugeLink UU; - for (int i = 0; i < Nrepresentation; i++) { - for (int j = 0; j < Nrepresentation; j++) { - vsplat(UU()()(i, j), U(mu)()(i, j)); - } - } - mult(&phi(), &UU(), &chi()); - } - - inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, - const GaugeField &Umu) { - SiteScalarGaugeField ScalarUmu; - SiteDoubledGaugeField ScalarUds; - - GaugeLinkField U(Umu._grid); - GaugeField Uadj(Umu._grid); - for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); - U = adj(Cshift(U, mu, -1)); - PokeIndex(Uadj, U, mu); - } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUmu, Umu, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - - peekLocalSite(ScalarUmu, Uadj, lcoor); - for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } - } - - inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, - FermionField &A, int mu) { + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu) + { assert(0); - } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - assert(0); - } - }; + } +}; //////////////////////////////////////////////////////////////////////////////////////// // Flavour doubled spinors; is Gparity the only? what about C*? 
//////////////////////////////////////////////////////////////////////////////////////// - template - class GparityWilsonImpl - : public ConjugateGaugeImpl > { - public: - static const int Dimension = Nrepresentation; +template +class GparityWilsonImpl : public ConjugateGaugeImpl > { + public: - const bool LsVectorised=false; + static const int Dimension = Nrepresentation; - typedef _Coeff_t Coeff_t; - typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + const bool LsVectorised=false; + + typedef _Coeff_t Coeff_t; + typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; + + INHERIT_GIMPL_TYPES(Gimpl); - INHERIT_GIMPL_TYPES(Gimpl); + template using iImplSpinor = iVector, Ns>, Ngp>; + template using iImplHalfSpinor = iVector, Nhs>, Ngp>; + template using iImplDoubledGaugeField = iVector >, Nds>, Ngp>; - template - using iImplSpinor = - iVector, Ns>, Ngp>; - template - using iImplHalfSpinor = - iVector, Nhs>, Ngp>; - template - using iImplDoubledGaugeField = - iVector >, Nds>, Ngp>; + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + + typedef Lattice FermionField; + typedef Lattice DoubledGaugeField; + + typedef WilsonCompressor Compressor; + typedef WilsonStencil StencilImpl; + + typedef GparityWilsonImplParams ImplParams; - typedef iImplSpinor SiteSpinor; - typedef iImplHalfSpinor SiteHalfSpinor; - typedef iImplDoubledGaugeField SiteDoubledGaugeField; - - typedef Lattice FermionField; - typedef Lattice DoubledGaugeField; - - typedef WilsonCompressor Compressor; - typedef WilsonStencil StencilImpl; + ImplParams Params; - typedef GparityWilsonImplParams ImplParams; - - ImplParams Params; + GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; + // provide the multiply by link that is differentiated between Gparity (with + // flavour index) and non-Gparity + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu, StencilEntry *SE, + StencilImpl &St) { - bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; - - // provide the multiply by link that is differentiated between Gparity (with - // flavour index) and non-Gparity - inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, - const SiteHalfSpinor &chi, int mu, StencilEntry *SE, - StencilImpl &St) { - typedef SiteHalfSpinor vobj; - typedef typename SiteHalfSpinor::scalar_object sobj; + typedef SiteHalfSpinor vobj; + typedef typename SiteHalfSpinor::scalar_object sobj; - vobj vtmp; - sobj stmp; + vobj vtmp; + sobj stmp; - GridBase *grid = St._grid; + GridBase *grid = St._grid; - const int Nsimd = grid->Nsimd(); + const int Nsimd = grid->Nsimd(); - int direction = St._directions[mu]; - int distance = St._distances[mu]; - int ptype = St._permute_type[mu]; - int sl = St._grid->_simd_layout[direction]; + int direction = St._directions[mu]; + int distance = St._distances[mu]; + int ptype = St._permute_type[mu]; + int sl = St._grid->_simd_layout[direction]; + + // Fixme X.Y.Z.T hardcode in stencil + int mmu = mu % Nd; - // Fixme X.Y.Z.T hardcode in stencil - int mmu = mu % Nd; + // assert our assumptions + assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code + assert((sl == 1) || (sl == 2)); + + std::vector icoor; - // assert our assumptions - assert((distance == 1) || (distance == -1)); // 
nearest neighbour stencil hard code - assert((sl == 1) || (sl == 2)); - - std::vector icoor; - - if ( SE->_around_the_world && Params.twists[mmu] ) { + if ( SE->_around_the_world && Params.twists[mmu] ) { - if ( sl == 2 ) { + if ( sl == 2 ) { + + std::vector vals(Nsimd); - std::vector vals(Nsimd); + extract(chi,vals); + for(int s=0;siCoorFromIindex(icoor,s); + grid->iCoorFromIindex(icoor,s); - assert((icoor[direction]==0)||(icoor[direction]==1)); + assert((icoor[direction]==0)||(icoor[direction]==1)); - int permute_lane; - if ( distance == 1) { - permute_lane = icoor[direction]?1:0; - } else { - permute_lane = icoor[direction]?0:1; + int permute_lane; + if ( distance == 1) { + permute_lane = icoor[direction]?1:0; + } else { + permute_lane = icoor[direction]?0:1; + } + + if ( permute_lane ) { + stmp(0) = vals[s](1); + stmp(1) = vals[s](0); + vals[s] = stmp; } - - if ( permute_lane ) { - stmp(0) = vals[s](1); - stmp(1) = vals[s](0); - vals[s] = stmp; - } - } - merge(vtmp,vals); + } + merge(vtmp,vals); + + } else { + vtmp(0) = chi(1); + vtmp(1) = chi(0); + } + mult(&phi(0),&U(0)(mu),&vtmp(0)); + mult(&phi(1),&U(1)(mu),&vtmp(1)); + + } else { + mult(&phi(0),&U(0)(mu),&chi(0)); + mult(&phi(1),&U(1)(mu),&chi(1)); + } + + } - } else { - vtmp(0) = chi(1); - vtmp(1) = chi(0); - } - mult(&phi(0),&U(0)(mu),&vtmp(0)); - mult(&phi(1),&U(1)(mu),&vtmp(1)); - - } else { - mult(&phi(0),&U(0)(mu),&chi(0)); - mult(&phi(1),&U(1)(mu),&chi(1)); - } + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) + { + conformable(Uds._grid,GaugeGrid); + conformable(Umu._grid,GaugeGrid); + + GaugeLinkField Utmp (GaugeGrid); + GaugeLinkField U (GaugeGrid); + GaugeLinkField Uconj(GaugeGrid); + + Lattice > coor(GaugeGrid); - } - - inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) - { - - conformable(Uds._grid,GaugeGrid); - conformable(Umu._grid,GaugeGrid); - - GaugeLinkField Utmp (GaugeGrid); - GaugeLinkField U (GaugeGrid); - GaugeLinkField Uconj(GaugeGrid); - - Lattice > coor(GaugeGrid); - - - for(int mu=0;mu(Umu,mu); - Uconj = conjugate(U); + U = PeekIndex(Umu,mu); + Uconj = conjugate(U); + + // This phase could come from a simple bc 1,1,-1,1 .. + int neglink = GaugeGrid->GlobalDimensions()[mu]-1; + if ( Params.twists[mu] ) { + Uconj = where(coor==neglink,-Uconj,Uconj); + } - // This phase could come from a simple bc 1,1,-1,1 .. - int neglink = GaugeGrid->GlobalDimensions()[mu]-1; - if ( Params.twists[mu] ) { - Uconj = where(coor==neglink,-Uconj,Uconj); - } +PARALLEL_FOR_LOOP + for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - PARALLEL_FOR_LOOP - for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { - link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); - } - PokeIndex(mat, link, mu); - return; - } + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { + + // DhopDir provides U or Uconj depending on coor/flavour. + GaugeLinkField link(mat._grid); + // use lorentz for flavour as hack. 
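// Illustrative reading of the hack above (not in the original patch): the two G-parity
// flavours are packed where a Lorentz index would normally sit, so the TraceIndex below
// leaves a 2x2 flavour matrix T per site; the force keeps T(0,0) from the ordinary links
// and subtracts conj(T(1,1)) for the flavour that couples through the conjugated links
// built in DoubleStore.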
+ auto tmp = TraceIndex(outerProduct(Btilde, A)); +PARALLEL_FOR_LOOP + for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { + link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); + } + PokeIndex(mat, link, mu); + return; + } - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, - FermionField Ã, int mu) { - int Ls = Btilde._grid->_fdimensions[0]; + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { + + int Ls = Btilde._grid->_fdimensions[0]; - GaugeLinkField tmp(mat._grid); - tmp = zero; - PARALLEL_FOR_LOOP - for (int ss = 0; ss < tmp._grid->oSites(); ss++) { - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); - tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - } - PokeIndex(mat, tmp, mu); - return; - } - }; + GaugeLinkField tmp(mat._grid); + tmp = zero; +PARALLEL_FOR_LOOP + for (int ss = 0; ss < tmp._grid->oSites(); ss++) { + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); + tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + } + PokeIndex(mat, tmp, mu); + return; + } - typedef WilsonImpl WilsonImplR; // Real.. whichever prec - typedef WilsonImpl WilsonImplF; // Float - typedef WilsonImpl WilsonImplD; // Double +}; + typedef WilsonImpl WilsonImplR; // Real.. whichever prec + typedef WilsonImpl WilsonImplF; // Float + typedef WilsonImpl WilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec - typedef WilsonImpl ZWilsonImplF; // Float - typedef WilsonImpl ZWilsonImplD; // Double + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplF; // Float + typedef WilsonImpl ZWilsonImplD; // Double + + typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec + typedef WilsonImpl WilsonAdjImplF; // Float + typedef WilsonImpl WilsonAdjImplD; // Double + + typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec + typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float + typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double + + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double + + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + + typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec + typedef GparityWilsonImpl GparityWilsonImplF; // Float + typedef GparityWilsonImpl GparityWilsonImplD; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec - typedef WilsonImpl WilsonAdjImplF; // Float - typedef WilsonImpl WilsonAdjImplD; // Double +}} - typedef WilsonImpl WilsonTwoIndexSymmetricImplR; // Real.. whichever prec - typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float - typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double - - typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec - typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. 
whichever prec - typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float - typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double - - typedef GparityWilsonImpl GparityWilsonImplR; // Real.. whichever prec - typedef GparityWilsonImpl GparityWilsonImplF; // Float - typedef GparityWilsonImpl GparityWilsonImplD; // Double -} -} #endif diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index ba7c5c8e..99baa8a0 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -222,7 +222,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// PARALLEL_FOR_LOOP for (int sss = 0; sss < B._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); } @@ -333,7 +333,7 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, + Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } }; @@ -351,13 +351,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, if (dag == DaggerYes) { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } else { PARALLEL_FOR_LOOP for (int sss = 0; sss < in._grid->oSites(); sss++) { - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, + Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); } } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index c4c92907..4c2d24bf 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -184,44 +184,37 @@ void WilsonFermion5D::Report(void) if ( DhopCalls > 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime - << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " - << DhopComputeTime << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; - RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; + RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting std::cout << GridLogMessage << 
"Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil"<::DerivInternal(StencilImpl & st, assert(sF < B._grid->oSites()); assert(sU < U._grid->oSites()); - Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, - gamma); + Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); //////////////////////////// // spin trace outer product @@ -342,10 +334,10 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, } template -void WilsonFermion5D::DhopDeriv( GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) +void WilsonFermion5D::DhopDeriv(GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionGrid()); conformable(A._grid,B._grid); @@ -358,9 +350,9 @@ void WilsonFermion5D::DhopDeriv( GaugeField &mat, template void WilsonFermion5D::DhopDerivEO(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -376,9 +368,9 @@ void WilsonFermion5D::DhopDerivEO(GaugeField &mat, template void WilsonFermion5D::DhopDerivOE(GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag) + const FermionField &A, + const FermionField &B, + int dag) { conformable(A._grid,FermionRedBlackGrid()); conformable(GaugeRedBlackGrid(),mat._grid); @@ -393,10 +385,9 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, template void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, - DoubledGaugeField & U, - const FermionField &in, FermionField &out,int dag) + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) { - DhopCalls++; // assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); @@ -413,27 +404,25 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); } #ifdef AVX512 } else if (stat.is_init() ) { int nthreads; stat.start(); - #pragma omp parallel +#pragma omp parallel { - #pragma omp master +#pragma omp master nthreads = omp_get_num_threads(); int mythread = omp_get_thread_num(); stat.enter(mythread); - #pragma omp for nowait - for(int ss=0;ssoSites();ss++) - { - int sU=ss; - int sF=LLs*sU; - Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); - } +#pragma omp for nowait + for(int ss=0;ssoSites();ss++) { + int sU=ss; + int sF=LLs*sU; + Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); + } stat.exit(mythread); } stat.accum(nthreads); @@ -443,8 +432,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, for (int ss = 0; ss < U._grid->oSites(); ss++) { int sU = ss; int sF = LLs * sU; - Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, - out); + 
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); } } DhopComputeTime+=usecond(); @@ -454,6 +442,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, template void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -465,6 +454,7 @@ void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,out._grid); // drops the cb check @@ -476,6 +466,7 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=2; conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,out._grid); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 83031b1b..ffb5c58e 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -34,8 +34,18 @@ Author: paboyle #include namespace Grid { +namespace QCD { - namespace QCD { + //////////////////////////////////////////////////////////////////////////////// + // This is the 4d red black case appropriate to support + // + // parity = (x+y+z+t)|2; + // generalised five dim fermions like mobius, zolotarev etc.. + // + // i.e. even even contains fifth dim hopping term. + // + // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ] + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // This is the 4d red black case appropriate to support @@ -114,78 +124,78 @@ namespace Grid { // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. 
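The "multiple comms" caveat is easiest to see at the call site. A minimal sketch of the pattern the comment warns about, assuming a constructed WilsonFermion5D object Dw, its five-dimensional grid FGrid and a source field src (illustrative names, not part of this patch):

    // Each DhopDir call runs its own stencil halo exchange, so covering all
    // eight (direction, displacement) pairs costs eight communication phases,
    // where a single fused Dhop needs only one.
    LatticeFermion hop(FGrid);
    for (int dir = 0; dir < 4; dir++) {
      for (int disp = -1; disp <= 1; disp += 2) {
        Dw.DhopDir(src, hop, dir, disp);  // one communication phase per call
        // ... accumulate hop into the point-split quantity being built
      }
    }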
- void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); - - /////////////////////////////////////////////////////////////// - // New methods added - /////////////////////////////////////////////////////////////// - void DerivInternal(StencilImpl & st, - DoubledGaugeField & U, - GaugeField &mat, - const FermionField &A, - const FermionField &B, - int dag); - - void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, - int dag); - - // Constructors - WilsonFermion5D(GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - GridRedBlackCartesian &FourDimRedBlackGrid, - double _M5,const ImplParams &p= ImplParams()); - - // Constructors - /* + void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); + + /////////////////////////////////////////////////////////////// + // New methods added + /////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl & st, + DoubledGaugeField & U, + GaugeField &mat, + const FermionField &A, + const FermionField &B, + int dag); + + void DhopInternal(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors + WilsonFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _M5,const ImplParams &p= ImplParams()); + + // Constructors + /* WilsonFermion5D(int simd, - GaugeField &_Umu, - GridCartesian &FiveDimGrid, - GridRedBlackCartesian &FiveDimRedBlackGrid, - GridCartesian &FourDimGrid, - double _M5,const ImplParams &p= ImplParams()); - */ + GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + double _M5,const ImplParams &p= ImplParams()); + */ + + // DoubleStore + void ImportGauge(const GaugeField &_Umu); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + public: + + // Add these to the support from Wilson + GridBase *_FourDimGrid; + GridBase *_FourDimRedBlackGrid; + GridBase *_FiveDimGrid; + GridBase *_FiveDimRedBlackGrid; + + double M5; + int Ls; + + //Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + // Comms buffer + std::vector > comm_buf; + + }; - // DoubleStore - void ImportGauge(const GaugeField &_Umu); - - /////////////////////////////////////////////////////////////// - // Data members require to support the functionality - /////////////////////////////////////////////////////////////// - public: - - // Add these to the support from Wilson - GridBase *_FourDimGrid; - GridBase *_FourDimRedBlackGrid; - GridBase *_FiveDimGrid; - GridBase *_FiveDimRedBlackGrid; - - double M5; - int Ls; - - //Defines the stencils for even and odd - StencilImpl Stencil; - StencilImpl StencilEven; - StencilImpl StencilOdd; - - // Copy of the gauge field , with even and odd subsets - DoubledGaugeField Umu; - DoubledGaugeField UmuEven; - DoubledGaugeField UmuOdd; - - LebesgueOrder 
Lebesgue; - LebesgueOrder LebesgueEvenOdd; - - // Comms buffer - std::vector > comm_buf; - - }; - } -} +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index 58e62cc8..52ee8704 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -43,10 +43,9 @@ WilsonKernels::WilsonKernels(const ImplParams &p) : Base(p){}; //////////////////////////////////////////// template -void WilsonKernels::DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -220,10 +219,9 @@ void WilsonKernels::DiracOptGenericDhopSiteDag( // Need controls to do interior, exterior, or both template -void WilsonKernels::DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out) { +void WilsonKernels::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out) { SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteHalfSpinor *chi_p; @@ -396,10 +394,9 @@ void WilsonKernels::DiracOptGenericDhopSite( }; template -void WilsonKernels::DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, int sF, - int sU, const FermionField &in, FermionField &out, int dir, int gamma) { +void WilsonKernels::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, + int sU, const FermionField &in, FermionField &out, int dir, int gamma) { + SiteHalfSpinor tmp; SiteHalfSpinor chi; SiteSpinor result; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 66770c3c..669ee4be 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -32,175 +32,132 @@ directory #define GRID_QCD_DHOP_H namespace Grid { +namespace QCD { - namespace QCD { - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Helper routines that implement Wilson stencil for a single site. - // Common to both the WilsonFermion and WilsonFermion5D - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class WilsonKernelsStatic { - public: - // S-direction is INNERMOST and takes no part in the parity. - static int AsmOpt; // these are a temporary hack - static int HandOpt; // these are a temporary hack - }; - - template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { - public: - - INHERIT_IMPL_TYPES(Impl); - typedef FermionOperator Base; - - public: - - template - typename std::enable_if::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Helper routines that implement Wilson stencil for a single site. 
+ // Common to both the WilsonFermion and WilsonFermion5D + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class WilsonKernelsStatic { + public: + // S-direction is INNERMOST and takes no part in the parity. + static int AsmOpt; // these are a temporary hack + static int HandOpt; // these are a temporary hack +}; + +template class WilsonKernels : public FermionOperator , public WilsonKernelsStatic { + public: + + INHERIT_IMPL_TYPES(Impl); + typedef FermionOperator Base; + +public: + + template + typename std::enable_if::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { #ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, - in, out); - - } else { + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { #else - { + { #endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); + sF++; } - - template - typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type - DiracOptDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, - out); - sF++; - } - sU++; - } - } - - template - typename std::enable_if::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { -#ifdef AVX512 - if (AsmOpt) { - WilsonKernels::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, - Ns, in, out); - } else { -#else - { -#endif - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if (HandOpt) - WilsonKernels::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - else - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, - sU, in, out); - sF++; - } - sU++; - } - } - } - - template - typename std::enable_if< - (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, - void>::type - DiracOptDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out) { - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - WilsonKernels::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, - in, out); - sF++; - } - sU++; - } - } - - void DiracOptDhopDir( - StencilImpl &st, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, - int gamma); - - private: - // Specialised variants - void DiracOptGenericDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField 
&out); - - void DiracOptGenericDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptAsmDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptAsmDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, int Ls, int Ns, const FermionField &in, - FermionField &out); - - void DiracOptHandDhopSite( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DiracOptHandDhopSiteDag( - StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - commVector &buf, - int sF, int sU, const FermionField &in, FermionField &out); - - public: - WilsonKernels(const ImplParams &p = ImplParams()); - }; - + sU++; } } + } + + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type + DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); + sF++; + } + sU++; + } + } + + template + typename std::enable_if::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { +#ifdef AVX512 + if (AsmOpt) { + WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { +#else + { +#endif + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if (HandOpt) + WilsonKernels::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + } + template + typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type + DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } + } + void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); + +private: + // Specialised variants + void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); + + void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); + + void 
DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + + void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, + int sF, int sU, const FermionField &in, FermionField &out); + +public: + + WilsonKernels(const ImplParams &p = ImplParams()); + +}; + +}} #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 7857e89a..74862400 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -33,31 +33,27 @@ Author: paboyle namespace Grid { - namespace QCD { +namespace QCD { - /////////////////////////////////////////////////////////// - // Default to no assembler implementation - /////////////////////////////////////////////////////////// - template - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - template - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - +/////////////////////////////////////////////////////////// +// Default to no assembler implementation +/////////////////////////////////////////////////////////// +template void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} +template void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} #if defined(AVX512) - /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// @@ -65,16 +61,16 @@ namespace Grid { #include #include - static Vector signs; +static Vector signs; - int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; - } - static int signInit = setupSigns(); + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -84,17 +80,15 @@ namespace Grid { #define FX(A) WILSONASM_ ##A #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const 
FermionField &in, FermionField &out) #include #undef VMOVIDUP @@ -109,31 +103,26 @@ namespace Grid { #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #define KERNEL_DAG - template<> - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - commVector &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #endif - #define INSTANTIATE_ASM(A)\ -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ +template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ - commVector &buf,\ + \ +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ - INSTANTIATE_ASM(WilsonImplF); INSTANTIATE_ASM(WilsonImplD); INSTANTIATE_ASM(ZWilsonImplF); @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); INSTANTIATE_ASM(DomainWallVec5dImplD); INSTANTIATE_ASM(ZDomainWallVec5dImplF); INSTANTIATE_ASM(ZDomainWallVec5dImplD); - } -} + +}} diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 9d7eac23..f5900832 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -311,10 +311,9 @@ namespace Grid { namespace QCD { - template - void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -554,10 +553,9 @@ namespace QCD { } } - template - void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int ss,int sU,const FermionField &in, FermionField &out) +template +void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionField &in, FermionField &out) { // std::cout << "Hand op Dhop "< -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int 
sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } -template<> -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - commVector &buf, - int sF,int sU,const FermionField &in, FermionField &out) +template<> void +WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int sF,int sU,const FermionField &in, FermionField &out) { assert(0); } @@ -840,12 +835,10 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, // Need Nc=3 though // #define INSTANTIATE_THEM(A) \ -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out);\ -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ - commVector &buf,\ - int ss,int sU,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); \ +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ + int ss,int sU,const FermionField &in, FermionField &out); INSTANTIATE_THEM(WilsonImplF); INSTANTIATE_THEM(WilsonImplD); diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h index 0dd5a9dc..409569d8 100644 --- a/lib/qcd/action/gauge/Photon.h +++ b/lib/qcd/action/gauge/Photon.h @@ -151,12 +151,19 @@ namespace QCD{ { auto *grid = dynamic_cast<GridCartesian *>(out._grid); const unsigned int nd = grid->_ndimension; + std::vector<int> latt_size = grid->_fdimensions; GaugeLinkField sqrtK2Inv(grid), r(grid); GaugeField aTilde(grid); FFT fft(grid); + Integer vol = 1; + for(int d = 0; d < nd; d++) + { + vol = vol * latt_size[d]; + } + invKHatSquared(sqrtK2Inv); - sqrtK2Inv = sqrt(real(sqrtK2Inv)); + sqrtK2Inv = sqrt(vol*real(sqrtK2Inv)); zmSub(sqrtK2Inv); for(int mu = 0; mu < nd; mu++) { diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h index 3240098a..914c3bad 100644 --- a/lib/qcd/utils/SUn.h +++ b/lib/qcd/utils/SUn.h @@ -674,6 +674,37 @@ class SU { out += la; } } +/* + add GaugeTransform +*/ + +template<typename GaugeField,typename GaugeMat> + static void GaugeTransform( GaugeField &Umu, GaugeMat &g){ + GridBase *grid = Umu._grid; + conformable(grid,g._grid); + + GaugeMat U(grid); + GaugeMat ag(grid); ag = adj(g); + + for(int mu=0;mu<Nd;mu++){ + U= PeekIndex<LorentzIndex>(Umu,mu); + U = g*U*Cshift(ag, mu, 1); + PokeIndex<LorentzIndex>(Umu,U,mu); + } + } + template<typename GaugeField,typename GaugeMat> + static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){ + GridBase *grid = g._grid; + GaugeMat ag(grid); ag = adj(g); + for(int mu=0;mu<Nd;mu++){ + U[mu] = g*U[mu]*Cshift(ag, mu, 1); + } + } + template<typename GaugeField,typename GaugeMat> + static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){ + LieRandomize(pRNG,g,1.0); + GaugeTransform(Umu,g); + }
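A quick illustration of the helpers just added: GaugeTransform rotates every link as U_mu(x) -> g(x) U_mu(x) adj(g(x+mu)), so gauge-invariant observables must be unchanged by it. A minimal smoke test sketched from routines visible in this patch; the grid, pRNG and Umu objects are assumed caller-side state, and the WilsonLoops template parameter is an assumption, not something this hunk specifies:

    // The plaquette is gauge invariant, so a random gauge transformation
    // must reproduce it to rounding accuracy.
    LatticeColourMatrix g(grid);                 // receives the random g(x)
    RealD p0 = WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
    SU<Nc>::RandomGaugeTransform(pRNG, Umu, g);  // LieRandomize then GaugeTransform
    RealD p1 = WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
    assert(fabs(p1 - p0) < 1.0e-10);             // double precision tolerance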
// Projects the algebra components of a lattice matrix (of dimension ncol*ncol -1 ) // inverse operation: FundamentalLieAlgebraMatrix diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 10022f50..03d45c07 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -522,4 +522,4 @@ typedef WilsonLoops SU3WilsonLoops; } } -#endif \ No newline at end of file +#endif diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index d1bd0190..c6443759 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -42,20 +42,14 @@ Author: paboyle namespace Grid{ namespace Optimization { - template<class vtype> - union uconv { - __m512 f; - vtype v; - }; - union u512f { __m512 v; - float f[8]; + float f[16]; }; union u512d { - __m512 v; - double f[4]; + __m512d v; + double f[8]; }; struct Vsplat{ @@ -388,9 +382,9 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases #ifndef __INTEL_COMPILER #warning "Slow reduction due to incomplete reduce intrinsics" //Complex float Reduce template<> inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index cdd54a33..19157db4 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -53,7 +53,7 @@ Author: paboyle #define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\ VSHUFMEMd(O,P,tmp) \ - VMULMEMd(O,P,B,Biirr) \ + VMULMEMd(O,P,B,Biirr) \ VMULMEMd(O,P,C,Ciirr) \ VMULd(tmp,B,Briir) \ VMULd(tmp,C,Criir) diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index dbfb30c2..cfa20c26 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -37,7 +37,7 @@ Author: paboyle // Opcodes common //////////////////////////////////////////////////////////////////////////////////////////////////// #define MASK_REGS \ - __asm__ ("mov $0xAAAA, %%eax \n"\ + __asm__ ("mov $0xAAAA, %%eax \n"\ "kmovw %%eax, %%k6 \n"\ "mov $0x5555, %%eax \n"\ "kmovw %%eax, %%k7 \n" : : : "%eax"); diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 0e35a414..1b71b8a5 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; else - Check._odata[i] = myStencil.comm_buf[SE->_offset]; + Check._odata[i] = myStencil.CommBuf()[SE->_offset]; } Real nrmC = norm2(Check); @@ -207,7 +207,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; else - OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; + OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; } for(int i=0;ioSites();i++){ int permute_type; @@ -220,7 +220,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; else - ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; + ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; } setCheckerboard(Check,ECheck); diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 562a53a1..6872f0e1 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -86,11 +86,12 @@ int main (int argc, char ** argv) FFT theFFT(&GRID); + Ctilde=C; std::cout<<" Benchmarking FFT of LatticeComplex "< QEDGimplTypesD; typedef Photon<QEDGimplTypesD> QEDGaction; @@ -450,6 +454,7 @@ int main (int argc, char ** argv) Maxwell.FreePropagator (Source,Prop); std::cout << " MaxwellFree propagator\n"; + */ } Grid_finalize(); }
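Between the two FFT tests it is worth making the normalisation concrete: with the convention quoted in Test_fft_gfix.cc below, A(p) = Sum_x e^{-ipx} A(x) on the forward pass and the 1/V factor is carried by the backward pass, so backward(forward(C)) reproduces C. A hedged round-trip sketch; GRID and pRNG are assumed to be the test's grid and parallel RNG, and the field names are illustrative:

    // Forward is unnormalised, backward divides by L in each dimension,
    // so a forward/backward round trip is the identity up to rounding.
    LatticeComplexD C(&GRID), Ctilde(&GRID), Cback(&GRID);
    random(pRNG, C);
    FFT theFFT(&GRID);
    theFFT.FFT_all_dim(Ctilde, C, FFT::forward);
    theFFT.FFT_all_dim(Cback, Ctilde, FFT::backward);
    Cback = Cback - C;
    std::cout << " FFT round-trip deviation = " << norm2(Cback) << std::endl; // expect ~0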
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc new file mode 100644 index 00000000..1eda9a67 --- /dev/null +++ b/tests/core/Test_fft_gfix.cc @@ -0,0 +1,301 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_fft_gfix.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; +using namespace Grid::QCD; + +template <class Gimpl> +class FourierAcceleratedGaugeFixer : public Gimpl { + public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) { + for(int mu=0;mu<Nd;mu++){ + ComplexD cmi(0.0,-1.0); + A[mu] = Ta(U[mu]) * cmi; + } + } + static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) { + dmuAmu=zero; + for(int mu=0;mu<Nd;mu++){ + dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1); + } + } + static void SteepestDescentGaugeFix(GaugeLorentz &Umu,RealD & alpha,int maxiter,RealD Omega_tol, RealD Phi_tol) { + GridBase *grid = Umu._grid; + RealD org_plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu); + RealD org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); + RealD old_trace = org_link_trace; + RealD trG; + + std::vector<GaugeMat> U(Nd,grid); + GaugeMat dmuAmu(grid); + + for(int i=0;i<maxiter;i++){ + for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu); + //trG = SteepestDescentStep(U,alpha,dmuAmu); + trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu); + for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu); + // Monitor progress and convergence test + // infrequently to minimise cost overhead + if ( i %20 == 0 ) { + RealD plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu); + RealD link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); + + std::cout << GridLogMessage << " Iteration "< &U,RealD & alpha, GaugeMat & dmuAmu) { + GridBase *grid = U[0]._grid; + + std::vector<GaugeMat> A(Nd,grid); + GaugeMat g(grid); + + GaugeLinkToLieAlgebraField(U,A); + ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); + + + RealD vol = grid->gSites(); + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU<Nc>::GaugeTransform(U,g); + + return trG; + } + + static RealD FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) { + + GridBase *grid = U[0]._grid; + + RealD vol = grid->gSites(); + + FFT theFFT((GridCartesian *)grid); + + LatticeComplex Fp(grid); + LatticeComplex psq(grid); psq=zero; + LatticeComplex pmu(grid); + LatticeComplex one(grid); one = ComplexD(1.0,0.0); + + GaugeMat g(grid); + GaugeMat dmuAmu_p(grid); + std::vector<GaugeMat> A(Nd,grid); + + GaugeLinkToLieAlgebraField(U,A); + + DmuAmu(A,dmuAmu); + + theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward); + + ////////////////////////////////// + // Work out Fp = psq_max/ psq... 
+ ////////////////////////////////// + std::vector latt_size = grid->GlobalDimensions(); + std::vector coor(grid->_ndimension,0); + for(int mu=0;mu::taExp(ciadmam,g); + + RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + + SU::GaugeTransform(U,g); + + return trG; + } + + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { + GridBase *grid = g._grid; + ComplexD cialpha(0.0,-alpha); + GaugeMat ciadmam(grid); + DmuAmu(A,dmuAmu); + ciadmam = dmuAmu*cialpha; + SU::taExp(ciadmam,g); + } +/* + //////////////////////////////////////////////////////////////// + // NB The FT for fields living on links has an extra phase in it + // Could add these to the FFT class as a later task since this code + // might be reused elsewhere ???? + //////////////////////////////////////////////////////////////// + static void InverseFourierTransformAmu(FFT &theFFT,const std::vector &Ap,std::vector &Ax) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + GaugeMat Apha(grid); + + ComplexD ci(0.0,1.0); + + for(int mu=0;mu &Ax,std::vector &Ap) { + GridBase * grid = theFFT.Grid(); + std::vector latt_size = grid->GlobalDimensions(); + + ComplexField pmu(grid); + ComplexField pha(grid); + ComplexD ci(0.0,1.0); + + // Sign convention for FFTW calls: + // A(x)= Sum_p e^ipx A(p) / V + // A(p)= Sum_p e^-ipx A(x) + + for(int mu=0;mu seeds({1,2,3,4}); + + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + int vol = 1; + for(int d=0;d::avgPlaquette(Umu); + std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "<
<plaq << std::endl; Grid_finalize(); }
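The test above monitors the plaquette, which checks that the gauge rotation was applied exactly, but not that the Landau condition was actually reached; the helpers defined in this file allow a direct check of the latter. A hedged sketch, assuming the PeriodicGimplR instantiation and the GRID, Umu and Nd objects from main (PeriodicGimplR is an assumption here, since the instantiation is not spelled out in the test body):

    // After SteepestDescentGaugeFix has converged, dmu A_mu should vanish,
    // so the norm of the divergence field is a direct figure of merit.
    std::vector<LatticeColourMatrix> U(Nd, &GRID);
    std::vector<LatticeColourMatrix> A(Nd, &GRID);
    for (int mu = 0; mu < Nd; mu++) U[mu] = PeekIndex<LorentzIndex>(Umu, mu);

    FourierAcceleratedGaugeFixer<PeriodicGimplR>::GaugeLinkToLieAlgebraField(U, A);

    LatticeColourMatrix dmuAmu(&GRID);
    FourierAcceleratedGaugeFixer<PeriodicGimplR>::DmuAmu(A, dmuAmu);

    std::cout << " |dmuAmu|^2 after fixing : " << norm2(dmuAmu) << std::endl; // expect ~0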