From 6616d5d09012cb7097e2703d19dfb039bae8a539 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Feb 2022 16:38:24 -0500 Subject: [PATCH 001/240] Commit --- Grid/stencil/Stencil.h | 107 ++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 17 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c2bc8dab..269ecfe4 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -240,6 +240,19 @@ public: cobj * mpi_p; Integer buffer_size; }; + struct CopyReceiveBuffer { + void * from_p; + void * to_p; + Integer bytes; + }; + struct CachedTransfer { + Integer direction; + Integer OrthogPlane; + Integer DestProc; + Integer bytes; + Integer lane; + void *recv_buf; + }; protected: @@ -271,7 +284,8 @@ public: std::vector MergersSHM; std::vector Decompressions; std::vector DecompressionsSHM; - + std::vector CopyReceiveBuffers ; + std::vector CachedTransfers; /////////////////////////////////////////////////////////// // Unified Comms buffers for all directions /////////////////////////////////////////////////////////// @@ -551,8 +565,57 @@ public: Mergers.resize(0); MergersSHM.resize(0); Packets.resize(0); + CopyReceiveBuffers.resize(0); + CachedTransfers.resize(0); calls++; } + void AddCopy(void *from,void * to, Integer bytes) + { + CopyReceiveBuffer obj; + obj.from_p = from; + obj.to_p = to; + obj.bytes= bytes; + CopyReceiveBuffers.push_back(obj); + } + void CommsCopy() + { + // These are device resident MPI buffers. 
+ for(int i=0;i void CommsMerge(decompressor decompress) { + CommsCopy(); CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { @@ -590,8 +654,8 @@ public: } template - void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) + { mergetime-=usecond(); for(int i=0;i>1; @@ -1045,9 +1111,10 @@ public: recv_buf=this->u_recv_buf_p; } + cobj *send_buf; send_buf = this->u_send_buf_p; // Gather locally, must send - + //////////////////////////////////////////////////////// // Gather locally //////////////////////////////////////////////////////// @@ -1056,23 +1123,27 @@ public: Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); - /////////////////////////////////////////////////////////// - // Build a list of things to do after we synchronise GPUs - // Start comms now??? - /////////////////////////////////////////////////////////// - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&recv_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); + int duplicate = CheckForDuplicate(dimension,x,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes); + if (!duplicate || 1) { // Force comms for now + /////////////////////////////////////////////////////////// + // Build a list of things to do after we synchronise GPUs + // Start comms now??? 
+ /////////////////////////////////////////////////////////// + AddPacket((void *)&send_buf[u_comm_offset], + (void *)&recv_buf[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + } + if ( compress.DecompressionStep() ) { AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], words,Decompressions); } u_comm_offset+=words; - } + } } return 0; } @@ -1181,8 +1252,10 @@ public: rpointers[i] = rp; - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - + int duplicate = CheckForDuplicate(dimension,x,nbr_proc,(void *)rp,i,bytes); + if (!duplicate || 1 ) { // Force comms for now + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + } } else { From 6283d11d5042ef36d306b28dfafe47ee7eab9d23 Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 8 Feb 2022 15:22:06 +0000 Subject: [PATCH 002/240] Add the comment line to tell the existance of copied data/buffer --- Grid/stencil/Stencil.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 269ecfe4..d39345f7 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -571,6 +571,7 @@ public: } void AddCopy(void *from,void * to, Integer bytes) { + std::cout << "Adding CopyReceiveBuffer "< Date: Tue, 15 Feb 2022 10:27:39 -0500 Subject: [PATCH 003/240] Bug fix to detection case --- Grid/stencil/Stencil.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index d39345f7..5a2d5099 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -601,6 +601,7 @@ public: obj.recv_buf = recv_buf; obj.bytes = bytes; obj.lane = lane; + for(int i=0;i Date: Thu, 17 Feb 2022 04:51:13 +0000 Subject: [PATCH 004/240] Staggered fix finished --- Grid/stencil/Stencil.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 5a2d5099..246bdb36 100644 --- 
a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -251,6 +251,7 @@ public: Integer DestProc; Integer bytes; Integer lane; + Integer cb; void *recv_buf; }; @@ -571,7 +572,7 @@ public: } void AddCopy(void *from,void * to, Integer bytes) { - std::cout << "Adding CopyReceiveBuffer "< Date: Tue, 22 Feb 2022 19:58:33 +0000 Subject: [PATCH 005/240] Dirichlet first cut - wrong answers on dagger multiply. Struggling to get a compute node so changing systems --- Grid/communicator/Communicator_base.h | 16 +- Grid/communicator/Communicator_mpi3.cc | 58 ++-- Grid/communicator/Communicator_none.cc | 10 +- Grid/communicator/SharedMemory.h | 7 +- Grid/communicator/SharedMemoryMPI.cc | 15 +- Grid/communicator/SharedMemoryNone.cc | 3 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 7 +- Grid/stencil/Stencil.h | 273 +++++++++--------- benchmarks/Benchmark_comms.cc | 20 +- benchmarks/Benchmark_dwf_fp32.cc | 219 ++++++++++----- systems/Tursa/dwf.4node.perf | 328 +++++++++++----------- systems/Tursa/dwf4.slurm | 7 +- 12 files changed, 523 insertions(+), 440 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index ffcfe37a..d4f12f86 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -53,10 +53,11 @@ public: // Communicator should know nothing of the physics grid, only processor grid. //////////////////////////////////////////// int _Nprocessors; // How many in all - Coordinate _processors; // Which dimensions get relayed out over processors lanes. int _processor; // linear processor rank - Coordinate _processor_coor; // linear processor coordinate unsigned long _ndimension; + Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes. + Coordinate _processors; // Which dimensions get relayed out over processors lanes. 
+ Coordinate _processor_coor; // linear processor coordinate static Grid_MPI_Comm communicator_world; Grid_MPI_Comm communicator; std::vector communicator_halo; @@ -97,8 +98,9 @@ public: int BossRank(void) ; int ThisRank(void) ; const Coordinate & ThisProcessorCoor(void) ; + const Coordinate & ShmGrid(void) { return _shm_processors; } ; const Coordinate & ProcessorGrid(void) ; - int ProcessorCount(void) ; + int ProcessorCount(void) ; //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid @@ -142,16 +144,16 @@ public: int bytes); double StencilSendToRecvFrom(void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, + int recv_from_rank,int do_recv, int bytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, + int recv_from_rank,int do_recv, int bytes,int dir); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 7b3e8847..ecdf1e53 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) // Remap using the shared memory optimising routine // The remap creates a comm which must be freed //////////////////////////////////////////////////// - GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); + GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors); InitFromMPICommunicator(processors,optimal_comm); SetCommunicator(optimal_comm); /////////////////////////////////////////////////// @@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); Coordinate 
parent_processor_coor(_ndimension,0); Coordinate parent_processors (_ndimension,1); - + Coordinate shm_processors (_ndimension,1); // Can make 5d grid from 4d etc... int pad = _ndimension-parent_ndimension; for(int d=0;d list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int dest, + int dest,int dox, void *recv, - int from, + int from,int dor, int bytes,int dir) { int ncomm =communicator_halo.size(); @@ -370,28 +372,32 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); - assert(shm!=NULL); - // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + } } - + if ( CommunicatorPolicy == CommunicatorPolicySequential ) { this->StencilSendToRecvFromComplete(list,dir); } diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index beb2cc97..a0f33ca4 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) : CartesianCommunicator(processors) { + _shm_processors = Coordinate(processors.size(),1); srank=0; SetCommunicator(communicator_world); } CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { + _shm_processors = Coordinate(processors.size(),1); _processors = processors; _ndimension = 
processors.size(); assert(_ndimension>=1); _processor_coor.resize(_ndimension); @@ -111,18 +113,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest } double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, + int recv_from_rank,int dor, int bytes, int dir) { return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, + int recv_from_rank,int dor, int bytes, int dir) { return 2.0*bytes; diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index f2d20a24..d55fbf3d 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -93,9 +93,10 @@ public: // Create an optimal reordered communicator that makes MPI_Cart_create get it right ////////////////////////////////////////////////////////////////////////////////////// static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD - static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian - static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian - static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian + // Turns MPI_COMM_WORLD into right layout for Cartesian + static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); + static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); + static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims); 
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims); /////////////////////////////////////////////////// // Provide shared memory facilities off comm world diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 795f3928..fe2f2d89 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2) } return log2size; } -void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { ////////////////////////////////////////////////////////////////////////////// // Look and see if it looks like an HPE 8600 based on hostname conventions @@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M gethostname(name,namelen); int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; - if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm); - else OptimalCommunicatorSharedMemory(processors,optimal_comm); + if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM); + else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM); } static inline int divides(int a,int b) { @@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD dim=(dim+1) %ndimension; } } -void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { //////////////////////////////////////////////////////////////// // Assert power of two shm_size. 
@@ -294,7 +294,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo Coordinate HyperCoor(ndimension); GetShmDims(WorldDims,ShmDims); - + SHM = ShmDims; + //////////////////////////////////////////////////////////////// // Establish torus of processes and nodes with sub-blockings //////////////////////////////////////////////////////////////// @@ -341,7 +342,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); assert(ierr==0); } -void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { //////////////////////////////////////////////////////////////// // Identify subblock of ranks on node spreading across dims @@ -353,6 +354,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); GetShmDims(WorldDims,ShmDims); + SHM=ShmDims; + //////////////////////////////////////////////////////////////// // Establish torus of processes and nodes with sub-blockings //////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc index 35663632..198a59d2 100644 --- a/Grid/communicator/SharedMemoryNone.cc +++ b/Grid/communicator/SharedMemoryNone.cc @@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) _ShmSetup=1; } -void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) +void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) { optimal_comm = WorldComm; + SHM = Coordinate(processors.size(),1); } 
//////////////////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 80231bb4..affdae10 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -173,7 +173,12 @@ public: GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); - + + void DirichletBlock(std::vector & block){ + Stencil.DirichletBlock(block); + StencilEven.DirichletBlock(block); + StencilOdd.DirichletBlock(block); + } // Constructors /* WilsonFermion5D(int simd, diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 246bdb36..930957d8 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -131,8 +131,11 @@ class CartesianStencilAccelerator { int _checkerboard; int _npoints; // Move to template param? int _osites; + int _dirichlet; StencilVector _directions; StencilVector _distances; + StencilVector _comms_send; + StencilVector _comms_recv; StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; @@ -226,6 +229,8 @@ public: void * recv_buf; Integer to_rank; Integer from_rank; + Integer do_send; + Integer do_recv; Integer bytes; }; struct Merge { @@ -255,7 +260,6 @@ public: void *recv_buf; }; - protected: GridBase * _grid; @@ -299,29 +303,6 @@ public: int u_comm_offset; int _unified_buffer_size; - ///////////////////////////////////////// - // Timing info; ugly; possibly temporary - ///////////////////////////////////////// - double commtime; - double mpi3synctime; - double mpi3synctime_g; - double shmmergetime; - double gathertime; - double gathermtime; - double halogtime; - double mergetime; - double decompresstime; - double comms_bytes; - double shm_bytes; - double splicetime; - double nosplicetime; - double calls; - std::vector comm_bytes_thr; - std::vector shm_bytes_thr; - std::vector comm_time_thr; - 
std::vector comm_enter_thr; - std::vector comm_leave_thr; - //////////////////////////////////////// // Stencil query //////////////////////////////////////// @@ -348,11 +329,12 @@ public: ////////////////////////////////////////// // Comms packet queue for asynch thread // Use OpenMP Tasks for cleaner ??? + // must be called *inside* parallel region ////////////////////////////////////////// + /* void CommunicateThreaded() { #ifdef GRID_OMP - // must be called in parallel region int mythread = omp_get_thread_num(); int nthreads = CartesianCommunicator::nCommThreads; #else @@ -361,65 +343,29 @@ public: #endif if (nthreads == -1) nthreads = 1; if (mythread < nthreads) { - comm_enter_thr[mythread] = usecond(); for (int i = mythread; i < Packets.size(); i += nthreads) { uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, Packets[i].to_rank, Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += 2*Packets[i].bytes-bytes; // Send + Recv. - } - comm_leave_thr[mythread]= usecond(); - comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } - - void CollateThreads(void) - { - int nthreads = CartesianCommunicator::nCommThreads; - double first=0.0; - double last =0.0; - - for(int t=0;t 0.0) && ( t0 < first ) ) first = t0; // min time seen - - if ( t1 > last ) last = t1; // max time seen - - } - commtime+= last-first; - } + */ //////////////////////////////////////////////////////////////////////// // Non blocking send and receive. Necessarily parallel. 
//////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); - commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comms_bytes+=bytes; - shm_bytes +=2*Packets[i].bytes-bytes; + _grid->StencilSendToRecvFromBegin(reqs[i], + Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].bytes,i); } } @@ -428,7 +374,6 @@ public: for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); } - commtime+=usecond(); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. @@ -436,28 +381,27 @@ public: void Communicate(void) { if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ - thread_region { - // must be called in parallel region - int mythread = thread_num(); - int maxthreads= thread_max(); - int nthreads = CartesianCommunicator::nCommThreads; - assert(nthreads <= maxthreads); - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - double start = usecond(); - uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - comm_bytes_thr[mythread] += bytes; - shm_bytes_thr[mythread] += Packets[i].bytes - bytes; - comm_time_thr[mythread] += usecond() - start; - } - } - } - } else { // Concurrent and non-threaded asynch calls to MPI + ///////////////////////////////////////////////////////// + // several way threaded on different communicators. 
+ // Cannot combine with Dirichlet operators + // This scheme is needed on Intel Omnipath for best performance + // Deprecate once there are very few omnipath clusters + ///////////////////////////////////////////////////////// + int nthreads = CartesianCommunicator::nCommThreads; + int old = GridThread::GetThreads(); + GridThread::SetThreads(nthreads); + thread_for(i,Packets.size(),{ + _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].bytes,i); + }); + GridThread::SetThreads(old); + } else { + ///////////////////////////////////////////////////////// + // Concurrent and non-threaded asynch calls to MPI + ///////////////////////////////////////////////////////// std::vector > reqs; this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); @@ -499,31 +443,23 @@ public: sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); if ( sshift[0] == sshift[1] ) { if (splice_dim) { - splicetime-=usecond(); - auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx); + auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx,point); is_same_node = is_same_node && tmp; - splicetime+=usecond(); } else { - nosplicetime-=usecond(); - auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx); + auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx,point); is_same_node = is_same_node && tmp; - nosplicetime+=usecond(); } } else { if(splice_dim){ - splicetime-=usecond(); // if checkerboard is unfavourable take two passes // both with block stride loop iteration - auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx); - auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx); + auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx,point); + auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx,point); is_same_node = is_same_node && tmp1 && tmp2; 
- splicetime+=usecond(); } else { - nosplicetime-=usecond(); - auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx); - auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx); + auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx,point); + auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx,point); is_same_node = is_same_node && tmp1 && tmp2; - nosplicetime+=usecond(); } } } @@ -533,13 +469,10 @@ public: template void HaloGather(const Lattice &source,compressor &compress) { - mpi3synctime_g-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime_g+=usecond(); // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); - halogtime-=usecond(); u_comm_offset=0; @@ -553,7 +486,6 @@ public: assert(u_comm_offset==_unified_buffer_size); accelerator_barrier(); - halogtime+=usecond(); } ///////////////////////// @@ -568,7 +500,6 @@ public: Packets.resize(0); CopyReceiveBuffers.resize(0); CachedTransfers.resize(0); - calls++; } void AddCopy(void *from,void * to, Integer bytes) { @@ -622,12 +553,17 @@ public: CachedTransfers.push_back(obj); return 0; } - void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + void AddPacket(void *xmit,void * rcv, + Integer to, Integer do_send, + Integer from, Integer do_recv, + Integer bytes){ Packet p; p.send_buf = xmit; p.recv_buf = rcv; p.to_rank = to; p.from_rank= from; + p.do_send = do_send; + p.do_recv = do_recv; p.bytes = bytes; Packets.push_back(p); } @@ -651,19 +587,13 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); } template void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { - - mergetime-=usecond(); for(int 
i=0;i &dirichlet_block) + { + this->_dirichlet = 1; + for(int ii=0;ii_npoints;ii++){ + int dimension = this->_directions[ii]; + int displacement = this->_distances[ii]; + int shift = displacement; + int gd = _grid->_gdimensions[dimension]; + int fd = _grid->_fdimensions[dimension]; + int pd = _grid->_processors [dimension]; + int ld = gd/pd; + int pc = _grid->_processor_coor[dimension]; + /////////////////////////////////////////// + // Figure out dirichlet send and receive + // on this leg of stencil. + /////////////////////////////////////////// + int comm_dim = _grid->_processors[dimension] >1 ; + int block = dirichlet_block[dimension]; + this->_comms_send[ii] = comm_dim; + this->_comms_recv[ii] = comm_dim; + if ( block ) { + assert(abs(displacement) < ld ); + + if( displacement > 0 ) { + // High side, low side + // | <--B--->| + // | | | + // noR + // noS + if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0; + if ( ( ld*pc ) % block == 0 ) this->_comms_send[ii] = 0; + } else { + // High side, low side + // | <--B--->| + // | | | + // noS + // noR + if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0; + if ( ( ld*pc ) % block == 0 ) this->_comms_recv[ii] = 0; + } + } + } + } CartesianStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances, Parameters p) - : shm_bytes_thr(npoints), - comm_bytes_thr(npoints), - comm_enter_thr(npoints), - comm_leave_thr(npoints), - comm_time_thr(npoints) { + this->_dirichlet = 0; face_table_computed=0; _grid = grid; this->parameters=p; @@ -745,6 +711,8 @@ public: this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels this->_directions = StencilVector(directions); this->_distances = StencilVector(distances); + this->_comms_send.resize(npoints); + this->_comms_recv.resize(npoints); this->same_node.resize(npoints); _unified_buffer_size=0; @@ -763,24 +731,27 @@ public: int displacement = distances[i]; int shift = 
displacement; + int gd = _grid->_gdimensions[dimension]; int fd = _grid->_fdimensions[dimension]; + int pd = _grid->_processors [dimension]; + int ld = gd/pd; int rd = _grid->_rdimensions[dimension]; + int pc = _grid->_processor_coor[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); this->_checkerboard = checkerboard; - ////////////////////////// - // the permute type - ////////////////////////// int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); int rotate_dim = _grid->_simd_layout[dimension]>2; + this->_comms_send[ii] = comm_dim; + this->_comms_recv[ii] = comm_dim; + assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; - ////////////////////////// // Underlying approach. For each local site build // up a table containing the npoint "neighbours" and whether they @@ -881,6 +852,7 @@ public: GridBase *grid=_grid; const int Nsimd = grid->Nsimd(); + int comms_recv = this->_comms_recv[point]; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -937,7 +909,9 @@ public: if ( (shiftpm== 1) && (sx_processor_coor[dimension]==grid->_processors[dimension]-1) ) { wraparound = 1; } - if (!offnode) { + + // Wrap locally dirichlet support case OR node local + if ( (offnode==0) || (comms_recv==0) ) { int permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); @@ -1054,11 +1028,14 @@ public: } template - int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) + int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; + int comms_send = this->_comms_send[point] ; + int comms_recv = this->_comms_recv[point] ; + 
assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1124,10 +1101,10 @@ public: //////////////////////////////////////////////////////// // Gather locally //////////////////////////////////////////////////////// - gathertime-=usecond(); assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; - gathertime+=usecond(); + if ( comms_send ) + Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); + face_idx++; int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now @@ -1138,12 +1115,12 @@ public: /////////////////////////////////////////////////////////// AddPacket((void *)&send_buf[u_comm_offset], (void *)&recv_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, + xmit_to_rank, comms_send, + recv_from_rank, comms_recv, bytes); } - if ( compress.DecompressionStep() ) { + if ( compress.DecompressionStep() && comms_recv ) { AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], words,Decompressions); @@ -1155,11 +1132,15 @@ public: } template - int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx,int point) { const int Nsimd = _grid->Nsimd(); const int maxl =2;// max layout in a direction + + int comms_send = this->_comms_send[point] ; + int comms_recv = this->_comms_recv[point] ; + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; @@ -1224,12 +1205,11 @@ public: &face_table[face_idx][0], face_table[face_idx].size()*sizeof(face_table_host[0])); } - gathermtime-=usecond(); - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + if ( comms_send ) + 
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; - gathermtime+=usecond(); //spointers[0] -- low //spointers[1] -- high @@ -1260,7 +1240,10 @@ public: int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now - AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + AddPacket((void *)sp,(void *)rp, + xmit_to_rank,comms_send, + recv_from_rank,comms_recv, + bytes); } } else { @@ -1270,7 +1253,9 @@ public: } } - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + if ( comms_recv ) { + AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + } u_comm_offset +=buffer_size; } diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index ccffb564..21e048f4 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -217,9 +217,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu][0], - recv_from_rank, + recv_from_rank,1, bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -228,9 +228,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu+4][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu+4][0], - recv_from_rank, + recv_from_rank,1, bytes,mu+4); } @@ -309,9 +309,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu][0], - recv_from_rank, + recv_from_rank,1, bytes,mu); Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); @@ -322,9 +322,9 @@ int main (int argc, char ** argv) dbytes+= Grid.StencilSendToRecvFromBegin(requests, (void *)&xbuf[mu+4][0], - xmit_to_rank, + xmit_to_rank,1, (void *)&rbuf[mu+4][0], - 
recv_from_rank, + recv_from_rank,1, bytes,mu+4); Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); @@ -411,8 +411,8 @@ int main (int argc, char ** argv) Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } int tid = omp_get_thread_num(); - tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, bytes,tid); + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, + (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); thread_critical { dbytes+=tbytes; } } diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 7fa37fb6..6896bddf 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -32,18 +32,112 @@ using namespace std; using namespace Grid; -template -struct scal { - d internal; +//////////////////////// +/// Move to domains //// +//////////////////////// + +struct DomainDecomposition +{ + Coordinate Block; + + DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);}; + + template + void ProjectDomain(Field &f,Integer domain) + { + GridBase *grid = f.Grid(); + int dims = grid->Nd(); + int isDWF= (dims==Nd+1); + assert((dims==Nd)||(dims==Nd+1)); + + Field zz(grid); zz = Zero(); + LatticeInteger coor(grid); + LatticeInteger domaincoor(grid); + LatticeInteger mask(grid); mask = Integer(1); + LatticeInteger zi(grid); zi = Integer(0); + for(int d=0;d +struct DirichletFilter: public MomentumFilterBase +{ + Coordinate Block; + + DirichletFilter(const Coordinate &_Block): Block(_Block) {} + // Edge detect using domain projectors + void applyFilter (MomentaField &U) const override + { + DomainDecomposition Domains(Block); + GridBase *grid = U.Grid(); + LatticeInteger coor(grid); + LatticeInteger face(grid); + LatticeInteger one(grid); one = 1; + LatticeInteger zero(grid); zero = 0; + LatticeInteger omega(grid); + LatticeInteger omegabar(grid); + LatticeInteger tmp(grid); + + 
omega=one; Domains.ProjectDomain(omega,0); + omegabar=one; Domains.ProjectDomain(omegabar,1); + + LatticeInteger nface(grid); nface=Zero(); + + MomentaField projected(grid); projected=Zero(); + typedef decltype(PeekIndex(U,0)) MomentaLinkField; + MomentaLinkField Umu(grid); + MomentaLinkField zz(grid); zz=Zero(); + + int dims = grid->Nd(); + Coordinate Global=grid->GlobalDimensions(); + assert(dims==Nd); + + for(int mu=0;mu(U,mu); + + // Upper face + tmp = Cshift(omegabar,mu,1); + tmp = tmp + omega; + face = where(tmp == Integer(2),one,zero ); + + tmp = Cshift(omega,mu,1); + tmp = tmp + omegabar; + face = where(tmp == Integer(2),one,face ); + + Umu = where(face,zz,Umu); + + PokeIndex(U, Umu, mu); + } + } + } +}; + + + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT +}; + +void Benchmark(int Ls, std::vector Dirichlet); int main (int argc, char ** argv) { @@ -52,24 +146,48 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); - Coordinate latt4 = GridDefaultLatt(); int Ls=16; - for(int i=0;i> Ls; } - + } + std::vector Dirichlet(5,0); + Benchmark(Ls,Dirichlet); + Coordinate latt4 = GridDefaultLatt(); + Coordinate mpi = GridDefaultMpi(); + Coordinate shm; + GlobalSharedMemory::GetShmDims(mpi,shm); + /* + Dirichlet = std::vector({0, + latt4[0]/mpi[0] * shm[0], + latt4[1]/mpi[1] * shm[1], + latt4[2]/mpi[2] * shm[2], + latt4[3]/mpi[3] * shm[3]}); + */ + Dirichlet = std::vector({0, + latt4[0]/mpi[0] , + latt4[1]/mpi[1] , + latt4[2]/mpi[2] , + latt4[3]/mpi[3] }); + + std::cout << " Dirichlet block "<< Dirichlet<< std::endl; + Benchmark(Ls,Dirichlet); + Grid_finalize(); + exit(0); +} +void Benchmark(int Ls, std::vector Dirichlet) +{ + Coordinate latt4 = GridDefaultLatt(); GridLogLayout(); long unsigned int single_site_flops = 8*Nc*(7+16*Nc); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - std::cout << GridLogMessage << "Making s innermost grids"<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; -#if 0 - Umu=1.0; - for(int mu=0;mu(Umu,mu); - // if (mu !=2 ) ttmp = 0; - // ttmp = ttmp* pow(10.0,mu); - PokeIndex(Umu,ttmp,mu); - } - std::cout << GridLogMessage << "Forced to diagonal " << std::endl; -#endif + //////////////////////////////////// + // Apply BCs + //////////////////////////////////// + std::cout << GridLogMessage << "Applying BCs " << std::endl; + Coordinate Block(4); + for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; + + std::cout << GridLogMessage << "Dirichlet Block " << Block<< std::endl; + DirichletFilter Filter(Block); + Filter.applyFilter(Umu); + //////////////////////////////////// // Naive wilson implementation //////////////////////////////////// @@ -191,11 +296,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); - Dw.ZeroCounters(); Dw.Dhop(src,result,0); std::cout<1.0e-4) ) { - /* - std::cout << "RESULT\n " << result<Barrier(); exit(-1); } assert (norm2(err)< 1.0e-4 ); - Dw.Report(); } if (1) @@ -294,13 +390,14 @@ int main (int argc, char ** argv) std::cout<1.0e-4)){ -/* - std::cout<< "DAG RESULT\n " < 1.0e-4 ) { + std::cout << "Error vector is\n" <Barrier(); Dw.DhopEO(src_o,r_e,DaggerNo); double t0=usecond(); @@ -352,7 +448,6 @@ int main (int argc, char ** argv) std::cout<1.0e-4)){ - /* - std::cout<< "Deo RESULT\n " < develop) uncommited changes +Current Grid git commit hash=da06d15f73184ceb15d66d4e7e702b02fed7b940: (HEAD -> feature/dirichlet, develop) uncommited changes Grid : Message : 
================================================ Grid : Message : MPI is initialised and logging filters activated @@ -124,122 +136,102 @@ Grid : Message : ================================================ Grid : Message : Requested 2147483648 byte stencil comms buffers Grid : Message : MemoryManager Cache 34004218675 bytes Grid : Message : MemoryManager::Init() setting up -Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory Grid : Message : MemoryManager::Init() Using cudaMalloc -Grid : Message : 1.198523 s : Grid Layout -Grid : Message : 1.198530 s : Global lattice size : 64 64 64 64 -Grid : Message : 1.198534 s : OpenMP threads : 4 -Grid : Message : 1.198535 s : MPI tasks : 2 2 2 2 -Grid : Message : 1.397615 s : Making s innermost grids -Grid : Message : 1.441828 s : Initialising 4d RNG -Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG' -Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 -Grid : Message : 1.954777 s : Initialising 5d RNG -Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG' -Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a -Grid : Message : 12.162710 s : Initialised RNGs -Grid : Message : 15.882520 s : Drawing gauge field -Grid : Message : 15.816362 s : Random gauge initialised -Grid : Message : 17.279671 s : Setting up Cshift based reference -Grid : Message : 26.331426 s : ***************************************************************** -Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm -Grid : Message : 26.331454 s : ***************************************************************** -Grid : Message : 
26.331456 s : ***************************************************************** -Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop -Grid : Message : 26.331459 s : * Vectorising space-time by 8 -Grid : Message : 26.331463 s : * VComplexF size is 64 B -Grid : Message : 26.331465 s : * SINGLE precision -Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute -Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 26.331469 s : ***************************************************************** -Grid : Message : 28.413717 s : Called warmup -Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us -Grid : Message : 56.418476 s : mflop/s = 3.79581e+07 -Grid : Message : 56.418479 s : mflop/s per rank = 2.37238e+06 -Grid : Message : 56.418481 s : mflop/s per node = 9.48953e+06 -Grid : Message : 56.418483 s : RF GiB/s (base 2) = 77130 -Grid : Message : 56.418485 s : mem GiB/s (base 2) = 48206.3 -Grid : Message : 56.422076 s : norm diff 1.03481e-13 -Grid : Message : 56.456894 s : #### Dhop calls report -Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls : 6002 -Grid : Message : 56.456903 s : WilsonFermion5D TotalTime /Calls : 4710.93 us -Grid : Message : 56.456905 s : WilsonFermion5D CommTime /Calls : 3196.15 us -Grid : Message : 56.456908 s : WilsonFermion5D FaceTime /Calls : 494.392 us -Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls : 44.4107 us -Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls : 1037.75 us -Grid : Message : 56.456921 s : Average mflops/s per call : 3.55691e+09 -Grid : Message : 56.456925 s : Average mflops/s per call per rank : 2.22307e+08 -Grid : Message : 56.456928 s : Average mflops/s per call per node : 8.89228e+08 -Grid : Message : 56.456930 s : Average mflops/s per call (full) : 3.82915e+07 -Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06 -Grid : Message : 56.456952 s : Average mflops/s per 
call per node (full): 9.57287e+06 -Grid : Message : 56.456954 s : WilsonFermion5D Stencil -Grid : Message : 56.457016 s : Stencil calls 3001 -Grid : Message : 56.457022 s : Stencil halogtime 0 -Grid : Message : 56.457024 s : Stencil gathertime 55.9154 -Grid : Message : 56.457026 s : Stencil gathermtime 20.1073 -Grid : Message : 56.457028 s : Stencil mergetime 18.5585 -Grid : Message : 56.457030 s : Stencil decompresstime 0.0639787 -Grid : Message : 56.457032 s : Stencil comms_bytes 4.02653e+08 -Grid : Message : 56.457034 s : Stencil commtime 6379.93 -Grid : Message : 56.457036 s : Stencil 63.1124 GB/s per rank -Grid : Message : 56.457038 s : Stencil 252.45 GB/s per node -Grid : Message : 56.457040 s : WilsonFermion5D StencilEven -Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd -Grid : Message : 56.457062 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness -Grid : Message : 79.259287 s : Called DwDag -Grid : Message : 79.259288 s : norm dag result 12.0421 -Grid : Message : 79.271740 s : norm dag ref 12.0421 -Grid : Message : 79.287759 s : norm dag diff 7.63236e-14 -Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec -Grid : Message : 79.955951 s : src_e0.499997 -Grid : Message : 80.633620 s : src_o0.500003 -Grid : Message : 80.164163 s : ********************************************************* -Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO -Grid : Message : 80.164170 s : * Vectorising space-time by 8 -Grid : Message : 80.164172 s : * SINGLE precision -Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute -Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 80.164178 s : ********************************************************* -Grid : 
Message : 93.797635 s : Deo mflop/s = 3.93231e+07 -Grid : Message : 93.797670 s : Deo mflop/s per rank 2.45769e+06 -Grid : Message : 93.797672 s : Deo mflop/s per node 9.83077e+06 -Grid : Message : 93.797674 s : #### Dhop calls report -Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls : 3001 -Grid : Message : 93.797677 s : WilsonFermion5D TotalTime /Calls : 4542.83 us -Grid : Message : 93.797679 s : WilsonFermion5D CommTime /Calls : 2978.97 us -Grid : Message : 93.797681 s : WilsonFermion5D FaceTime /Calls : 602.287 us -Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls : 67.1416 us -Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls : 1004.07 us -Grid : Message : 93.797713 s : Average mflops/s per call : 3.30731e+09 -Grid : Message : 93.797717 s : Average mflops/s per call per rank : 2.06707e+08 -Grid : Message : 93.797719 s : Average mflops/s per call per node : 8.26827e+08 -Grid : Message : 93.797721 s : Average mflops/s per call (full) : 3.97084e+07 -Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06 -Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06 -Grid : Message : 93.797735 s : WilsonFermion5D Stencil -Grid : Message : 93.797746 s : WilsonFermion5D StencilEven -Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd -Grid : Message : 93.797769 s : Stencil calls 3001 -Grid : Message : 93.797773 s : Stencil halogtime 0 -Grid : Message : 93.797776 s : Stencil gathertime 56.7458 -Grid : Message : 93.797780 s : Stencil gathermtime 22.6504 -Grid : Message : 93.797782 s : Stencil mergetime 21.1913 -Grid : Message : 93.797786 s : Stencil decompresstime 0.0556481 -Grid : Message : 93.797788 s : Stencil comms_bytes 2.01327e+08 -Grid : Message : 93.797791 s : Stencil commtime 2989.33 -Grid : Message : 93.797795 s : Stencil 67.3484 GB/s per rank -Grid : Message : 93.797798 s : Stencil 269.394 GB/s per node -Grid : Message : 93.797801 s : 
WilsonFermion5D Stencil Reporti() -Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 93.873429 s : r_e6.02111 -Grid : Message : 93.879931 s : r_o6.02102 -Grid : Message : 93.885912 s : res12.0421 -Grid : Message : 94.876555 s : norm diff 0 -Grid : Message : 95.485643 s : norm diff even 0 -Grid : Message : 95.581236 s : norm diff odd 0 +Grid : Message : 1.875883 s : Grid Layout +Grid : Message : 1.875893 s : Global lattice size : 64 64 64 64 +Grid : Message : 1.875897 s : OpenMP threads : 4 +Grid : Message : 1.875898 s : MPI tasks : 2 2 2 2 +Grid : Message : 1.993571 s : Initialising 4d RNG +Grid : Message : 2.881990 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 2.882370 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 2.495044 s : Initialising 5d RNG +Grid : Message : 4.120900 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 4.121350 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 15.268010 s : Drawing gauge field +Grid : Message : 16.234025 s : Random gauge initialised +Grid : Message : 16.234057 s : Applying BCs +Grid : Message : 16.365565 s : Setting up Cshift based reference +Grid : Message : 44.512418 s : ***************************************************************** +Grid : Message : 44.512448 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 44.512450 s : ***************************************************************** +Grid : Message : 44.512451 s : ***************************************************************** +Grid : Message : 44.512452 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 44.512453 s : * Vectorising space-time by 8 +Grid : Message : 44.512454 s : * VComplexF size is 64 B +Grid : Message : 44.512456 s : * SINGLE precision +Grid : 
Message : 44.512459 s : * Using Overlapped Comms/Compute +Grid : Message : 44.512460 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 44.512461 s : ***************************************************************** +Grid : Message : 46.389070 s : Called warmup +Grid : Message : 49.211265 s : Called Dw 300 times in 2.82203e+06 us +Grid : Message : 49.211295 s : mflop/s = 3.76681e+07 +Grid : Message : 49.211297 s : mflop/s per rank = 2.35425e+06 +Grid : Message : 49.211299 s : mflop/s per node = 9.41702e+06 +Grid : Message : 49.211301 s : RF GiB/s (base 2) = 76540.6 +Grid : Message : 49.211308 s : mem GiB/s (base 2) = 47837.9 +Grid : Message : 49.214868 s : norm diff 1.06409e-13 +Grid : Message : 92.647781 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 92.647816 s : Called DwDag +Grid : Message : 92.647817 s : norm dag result 12.0421 +Grid : Message : 92.801806 s : norm dag ref 12.0421 +Grid : Message : 92.817724 s : norm dag diff 7.21921e-14 +Grid : Message : 92.858973 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 93.210378 s : src_e0.499997 +Grid : Message : 93.583286 s : src_o0.500003 +Grid : Message : 93.682468 s : ********************************************************* +Grid : Message : 93.682471 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 93.682472 s : * Vectorising space-time by 8 +Grid : Message : 93.682473 s : * SINGLE precision +Grid : Message : 93.682475 s : * Using Overlapped Comms/Compute +Grid : Message : 93.682476 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 93.682477 s : ********************************************************* +Grid : Message : 95.162342 s : Deo mflop/s = 3.92487e+07 +Grid : Message : 95.162387 s : Deo mflop/s per rank 2.45305e+06 +Grid : Message : 95.162389 s : Deo mflop/s per node 9.81219e+06 +Grid : Message : 95.232801 s : r_e6.02111 +Grid : Message : 95.240061 s : r_o6.02102 +Grid : Message : 95.245975 s : res12.0421 +Grid : 
Message : 95.833402 s : norm diff 0 +Grid : Message : 96.573829 s : norm diff even 0 +Grid : Message : 96.868272 s : norm diff odd 0 + Dirichlet block [0 64 64 32 32] +Grid : Message : 97.756909 s : Grid Layout +Grid : Message : 97.756911 s : Global lattice size : 64 64 64 64 +Grid : Message : 97.756921 s : OpenMP threads : 4 +Grid : Message : 97.756922 s : MPI tasks : 2 2 2 2 +Grid : Message : 97.897085 s : Initialising 4d RNG +Grid : Message : 97.965061 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 97.965097 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 98.367431 s : Initialising 5d RNG +Grid : Message : 99.752745 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 99.752790 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 111.290148 s : Drawing gauge field +Grid : Message : 112.349289 s : Random gauge initialised +Grid : Message : 112.349320 s : Applying BCs +Grid : Message : 113.948740 s : Setting up Cshift based reference +Grid : Message : 140.320415 s : ***************************************************************** +Grid : Message : 140.320443 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 140.320444 s : ***************************************************************** +Grid : Message : 140.320445 s : ***************************************************************** +Grid : Message : 140.320446 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 140.320447 s : * Vectorising space-time by 8 +Grid : Message : 140.320448 s : * VComplexF size is 64 B +Grid : Message : 140.320450 s : * SINGLE precision +Grid : Message : 140.320451 s : * Using Overlapped Comms/Compute +Grid : Message : 140.320452 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 140.320453 s : ***************************************************************** +Grid : Message : 142.296150 s 
: Called warmup +Grid : Message : 144.397678 s : Called Dw 300 times in 2.36719e+06 us +Grid : Message : 144.397700 s : mflop/s = 4.49058e+07 +Grid : Message : 144.397702 s : mflop/s per rank = 2.80661e+06 +Grid : Message : 144.397704 s : mflop/s per node = 1.12265e+07 +Grid : Message : 144.397706 s : RF GiB/s (base 2) = 91247.6 +Grid : Message : 144.397708 s : mem GiB/s (base 2) = 57029.7 +Grid : Message : 144.401269 s : norm diff 9.78944e-14 +Grid : Message : 186.885460 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 186.885492 s : Called DwDag +Grid : Message : 186.885493 s : norm dag result 10.4157 +Grid : Message : 186.897154 s : norm dag ref 11.2266 +Grid : Message : 186.912538 s : norm dag diff 0.484633 diff --git a/systems/Tursa/dwf4.slurm b/systems/Tursa/dwf4.slurm index 65191398..5940ac05 100644 --- a/systems/Tursa/dwf4.slurm +++ b/systems/Tursa/dwf4.slurm @@ -1,14 +1,13 @@ #!/bin/bash #SBATCH -J dslash -#SBATCH -A tc002 -#SBATCH -t 2:20:00 -#SBATCH --nodelist=tu-c0r0n[00,03,06,09] +#SBATCH -A dp207 #SBATCH --exclusive #SBATCH --nodes=4 #SBATCH --ntasks=16 +#SBATCH --qos=standard #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=8 -#SBATCH --time=12:00:00 +#SBATCH --time=0:05:00 #SBATCH --partition=gpu #SBATCH --gres=gpu:4 #SBATCH --output=%x.%j.out From 70988e43d23c6fc4c86b482d56a56c9efd6e6912 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Feb 2022 01:42:14 -0500 Subject: [PATCH 006/240] Passes multinode dirichlet test with boundaries at node boundary or at the single rank boundary --- benchmarks/Benchmark_dwf_fp32.cc | 196 ++++++++++++++----------------- systems/Spock/sourceme.sh | 2 +- 2 files changed, 91 insertions(+), 107 deletions(-) diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 6896bddf..5a64aaa8 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -36,100 +36,41 @@ using namespace Grid; /// Move to domains //// 
//////////////////////// -struct DomainDecomposition -{ - Coordinate Block; - - DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);}; - - template - void ProjectDomain(Field &f,Integer domain) - { - GridBase *grid = f.Grid(); - int dims = grid->Nd(); - int isDWF= (dims==Nd+1); - assert((dims==Nd)||(dims==Nd+1)); - - Field zz(grid); zz = Zero(); - LatticeInteger coor(grid); - LatticeInteger domaincoor(grid); - LatticeInteger mask(grid); mask = Integer(1); - LatticeInteger zi(grid); zi = Integer(0); - for(int d=0;d struct DirichletFilter: public MomentumFilterBase { + typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type + typedef typename MomentaField::scalar_type scalar_type; //scalar complex type + + typedef iScalar > > ScalarType; //complex phase for each site + Coordinate Block; - DirichletFilter(const Coordinate &_Block): Block(_Block) {} + DirichletFilter(const Coordinate &_Block): Block(_Block){} - // Edge detect using domain projectors - void applyFilter (MomentaField &U) const override + void applyFilter(MomentaField &P) const override { - DomainDecomposition Domains(Block); - GridBase *grid = U.Grid(); - LatticeInteger coor(grid); - LatticeInteger face(grid); - LatticeInteger one(grid); one = 1; - LatticeInteger zero(grid); zero = 0; - LatticeInteger omega(grid); - LatticeInteger omegabar(grid); - LatticeInteger tmp(grid); + GridBase *grid = P.Grid(); + typedef decltype(PeekIndex(P, 0)) LatCM; + //////////////////////////////////////////////////// + // Zero strictly links crossing between domains + //////////////////////////////////////////////////// + LatticeInteger coor(grid); + LatCM zz(grid); zz = Zero(); + for(int mu=0;mu(U,0)) MomentaLinkField; - MomentaLinkField Umu(grid); - MomentaLinkField zz(grid); zz=Zero(); - - int dims = grid->Nd(); - Coordinate Global=grid->GlobalDimensions(); - assert(dims==Nd); - - for(int mu=0;mu(U,mu); - - // Upper face - tmp = Cshift(omegabar,mu,1); - 
tmp = tmp + omega; - face = where(tmp == Integer(2),one,zero ); - - tmp = Cshift(omega,mu,1); - tmp = tmp + omegabar; - face = where(tmp == Integer(2),one,face ); - - Umu = where(face,zz,Umu); - - PokeIndex(U, Umu, mu); + if ( Block[mu] ) { + // If costly could provide Grid earlier and precompute masks + LatticeCoordinate(coor,mu); + auto P_mu = PeekIndex(P, mu); + P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu); + PokeIndex(P, P_mu, mu); } } } }; - Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -152,27 +93,57 @@ int main (int argc, char ** argv) std::stringstream ss(argv[i+1]); ss >> Ls; } } + + ////////////////// + // With comms + ////////////////// std::vector Dirichlet(5,0); + + std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" < CommDim(Nd); Coordinate shm; GlobalSharedMemory::GetShmDims(mpi,shm); - /* + + + ////////////////////// + // Node level + ////////////////////// + std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 1 : 0; Dirichlet = std::vector({0, - latt4[0]/mpi[0] * shm[0], - latt4[1]/mpi[1] * shm[1], - latt4[2]/mpi[2] * shm[2], - latt4[3]/mpi[3] * shm[3]}); - */ - Dirichlet = std::vector({0, - latt4[0]/mpi[0] , - latt4[1]/mpi[1] , - latt4[2]/mpi[2] , - latt4[3]/mpi[3] }); - - std::cout << " Dirichlet block "<< Dirichlet<< std::endl; + CommDim[0]*latt4[0]/mpi[0] * shm[0], + CommDim[1]*latt4[1]/mpi[1] * shm[1], + CommDim[2]*latt4[2]/mpi[2] * shm[2], + CommDim[3]*latt4[3]/mpi[3] * shm[3]}); + Benchmark(Ls,Dirichlet); + + std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 
1 : 0; + Dirichlet = std::vector({0, + CommDim[0]*latt4[0]/mpi[0], + CommDim[1]*latt4[1]/mpi[1], + CommDim[2]*latt4[2]/mpi[2], + CommDim[3]*latt4[3]/mpi[3]}); + + Benchmark(Ls,Dirichlet); + Grid_finalize(); exit(0); } @@ -203,8 +174,20 @@ void Benchmark(int Ls, std::vector Dirichlet) GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); LatticeFermionF src (FGrid); random(RNG5,src); +#if 1 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp< Dirichlet) //////////////////////////////////// // Apply BCs //////////////////////////////////// - std::cout << GridLogMessage << "Applying BCs " << std::endl; Coordinate Block(4); for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; - std::cout << GridLogMessage << "Dirichlet Block " << Block<< std::endl; + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block " << Block << std::endl; + DirichletFilter Filter(Block); Filter.applyFilter(Umu); //////////////////////////////////// // Naive wilson implementation //////////////////////////////////// - // replicate across fifth dimension - // LatticeGaugeFieldF Umu5d(FGrid); std::vector U(4,UGrid); for(int mu=0;mu(Umu,mu); } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; if (1) @@ -297,6 +279,7 @@ void Benchmark(int Ls, std::vector Dirichlet) DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); Dw.DirichletBlock(Dirichlet); + int ncall =300; if (1) { @@ -328,8 +311,8 @@ void Benchmark(int Ls, std::vector Dirichlet) std::cout< Dirichlet) } ref = -0.5*ref; } - // dump=1; - Dw.Dhop(src,result,1); + + Dw.Dhop(src,result,DaggerYes); + + std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl; std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; + std::cout << GridLogMessage 
<< "----------------------------------------------------------------" << std::endl; + std::cout< 1.0e-4 ) { - std::cout << "Error vector is\n" <1.0e-4) { + std::cout << err << std::endl; } assert((norm2(err)<1.0e-4)); diff --git a/systems/Spock/sourceme.sh b/systems/Spock/sourceme.sh index 40d864b5..72a2ff4e 100644 --- a/systems/Spock/sourceme.sh +++ b/systems/Spock/sourceme.sh @@ -1,5 +1,5 @@ module load PrgEnv-gnu -module load rocm/4.3.0 +module load rocm/4.5.0 module load gmp module load cray-fftw module load craype-accel-amd-gfx908 From 0f1c5b08a1ca3dfab58ab995f3be41f13e09a601 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Feb 2022 19:29:28 -0500 Subject: [PATCH 007/240] Dirichlet filters running on AMD and now integrated in Fermion op --- Grid/qcd/action/ActionCore.h | 3 + Grid/qcd/action/fermion/FermionOperator.h | 2 + Grid/qcd/action/fermion/WilsonFermion5D.h | 18 ++++- .../WilsonFermion5DImplementation.h | 11 ++- Grid/qcd/action/filters/DirichletFilter.h | 71 +++++++++++++++++ .../filters}/MomentumFilter.h | 0 Grid/qcd/hmc/integrators/Integrator.h | 1 - Grid/stencil/Stencil.h | 2 +- benchmarks/Benchmark_dwf_fp32.cc | 78 ++++++------------- 9 files changed, 125 insertions(+), 61 deletions(-) create mode 100644 Grid/qcd/action/filters/DirichletFilter.h rename Grid/qcd/{hmc/integrators => action/filters}/MomentumFilter.h (100%) diff --git a/Grid/qcd/action/ActionCore.h b/Grid/qcd/action/ActionCore.h index 6544318d..0ea18d31 100644 --- a/Grid/qcd/action/ActionCore.h +++ b/Grid/qcd/action/ActionCore.h @@ -37,6 +37,9 @@ NAMESPACE_CHECK(ActionSet); #include NAMESPACE_CHECK(ActionParams); +#include +#include + //////////////////////////////////////////// // Gauge Actions //////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index 570e350d..0c159300 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -49,6 +49,8 @@ 
public: virtual FermionField &tmp(void) = 0; + virtual void DirichletBlock(Coordinate & _Block) { assert(0); }; + GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index affdae10..91abf86a 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -75,6 +75,10 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } + int Dirichlet; + Coordinate Block; + + /********** Deprecate timers **********/ void Report(void); void ZeroCounters(void); double DhopCalls; @@ -174,10 +178,16 @@ public: GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); - void DirichletBlock(std::vector & block){ - Stencil.DirichletBlock(block); - StencilEven.DirichletBlock(block); - StencilOdd.DirichletBlock(block); + virtual void DirichletBlock(Coordinate & block) + { + assert(block.size()==Nd+1); + if ( block[0] || block[1] || block[2] || block[3] || block[4] ){ + Dirichlet = 1; + Block = block; + Stencil.DirichletBlock(block); + StencilEven.DirichletBlock(block); + StencilOdd.DirichletBlock(block); + } } // Constructors /* diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2cc308cc..7775ad9d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -60,7 +60,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, UmuOdd (_FourDimRedBlackGrid), Lebesgue(_FourDimGrid), LebesgueEvenOdd(_FourDimRedBlackGrid), - _tmp(&FiveDimRedBlackGrid) + _tmp(&FiveDimRedBlackGrid), + Dirichlet(0) { // some assertions assert(FiveDimGrid._ndimension==5); @@ -218,6 +219,14 @@ void 
WilsonFermion5D::ImportGauge(const GaugeField &_Umu) { GaugeField HUmu(_Umu.Grid()); HUmu = _Umu*(-0.5); + if ( Dirichlet ) { + std::cout << GridLogMessage << " Dirichlet BCs 5d " < Filter(GaugeBlock); + Filter.applyFilter(HUmu); + } Impl::DoubleStore(GaugeGrid(),Umu,HUmu); pickCheckerboard(Even,UmuEven,Umu); pickCheckerboard(Odd ,UmuOdd,Umu); diff --git a/Grid/qcd/action/filters/DirichletFilter.h b/Grid/qcd/action/filters/DirichletFilter.h new file mode 100644 index 00000000..95353dab --- /dev/null +++ b/Grid/qcd/action/filters/DirichletFilter.h @@ -0,0 +1,71 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h + +Copyright (C) 2015 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//-------------------------------------------------------------------- +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +struct DirichletFilter: public MomentumFilterBase +{ + typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type + typedef typename MomentaField::scalar_type scalar_type; //scalar complex type + + typedef iScalar > > ScalarType; //complex phase for each site + + Coordinate Block; + + DirichletFilter(const Coordinate &_Block): Block(_Block){} + + void applyFilter(MomentaField &P) const override + { + GridBase *grid = P.Grid(); + typedef decltype(PeekIndex(P, 0)) LatCM; + //////////////////////////////////////////////////// + // Zero strictly links crossing between domains + //////////////////////////////////////////////////// + LatticeInteger coor(grid); + LatCM zz(grid); zz = Zero(); + for(int mu=0;muGlobalDimensions()[mu] ) ) { + // If costly could provide Grid earlier and precompute masks + std::cout << " Dirichlet in mu="<(P, mu); + P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu); + PokeIndex(P, P_mu, mu); + } + } + } +}; + + + +NAMESPACE_END(Grid); + diff --git a/Grid/qcd/hmc/integrators/MomentumFilter.h b/Grid/qcd/action/filters/MomentumFilter.h similarity index 100% rename from Grid/qcd/hmc/integrators/MomentumFilter.h rename to Grid/qcd/action/filters/MomentumFilter.h diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index aa28c6c8..6b0e3caf 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -33,7 +33,6 @@ directory #define INTEGRATOR_INCLUDED #include -#include "MomentumFilter.h" NAMESPACE_BEGIN(Grid); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 930957d8..e2430234 100644 --- 
a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -648,7 +648,7 @@ public: } } /// Introduce a block structure and switch off comms on boundaries - void DirichletBlock(const std::vector &dirichlet_block) + void DirichletBlock(const Coordinate &dirichlet_block) { this->_dirichlet = 1; for(int ii=0;ii_npoints;ii++){ diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 5a64aaa8..e5cc2e63 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -36,41 +36,6 @@ using namespace Grid; /// Move to domains //// //////////////////////// -template -struct DirichletFilter: public MomentumFilterBase -{ - typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type - typedef typename MomentaField::scalar_type scalar_type; //scalar complex type - - typedef iScalar > > ScalarType; //complex phase for each site - - Coordinate Block; - - DirichletFilter(const Coordinate &_Block): Block(_Block){} - - void applyFilter(MomentaField &P) const override - { - GridBase *grid = P.Grid(); - typedef decltype(PeekIndex(P, 0)) LatCM; - //////////////////////////////////////////////////// - // Zero strictly links crossing between domains - //////////////////////////////////////////////////// - LatticeInteger coor(grid); - LatCM zz(grid); zz = Zero(); - for(int mu=0;mu(P, mu); - P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu); - PokeIndex(P, P_mu, mu); - } - } - } -}; - - Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -78,7 +43,7 @@ Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaT }; -void Benchmark(int Ls, std::vector Dirichlet); +void Benchmark(int Ls, Coordinate Dirichlet); int main (int argc, char ** argv) { @@ -97,8 +62,9 @@ int main (int argc, char ** argv) ////////////////// // With comms ////////////////// - std::vector Dirichlet(5,0); - + Coordinate Dirichlet(Nd+1,0); + + std::cout << "\n\n\n\n\n\n" < CommDim(Nd); + Coordinate CommDim(Nd); 
Coordinate shm; GlobalSharedMemory::GetShmDims(mpi,shm); @@ -118,36 +84,39 @@ int main (int argc, char ** argv) ////////////////////// // Node level ////////////////////// + std::cout << "\n\n\n\n\n\n" <1 ? 1 : 0; - Dirichlet = std::vector({0, - CommDim[0]*latt4[0]/mpi[0] * shm[0], - CommDim[1]*latt4[1]/mpi[1] * shm[1], - CommDim[2]*latt4[2]/mpi[2] * shm[2], - CommDim[3]*latt4[3]/mpi[3] * shm[3]}); + Dirichlet[0] = 0; + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; Benchmark(Ls,Dirichlet); + std::cout << "\n\n\n\n\n\n" <1 ? 1 : 0; - Dirichlet = std::vector({0, - CommDim[0]*latt4[0]/mpi[0], - CommDim[1]*latt4[1]/mpi[1], - CommDim[2]*latt4[2]/mpi[2], - CommDim[3]*latt4[3]/mpi[3]}); + Dirichlet[0] = 0; + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; Benchmark(Ls,Dirichlet); Grid_finalize(); exit(0); } -void Benchmark(int Ls, std::vector Dirichlet) +void Benchmark(int Ls, Coordinate Dirichlet) { Coordinate latt4 = GridDefaultLatt(); GridLogLayout(); @@ -196,7 +165,9 @@ void Benchmark(int Ls, std::vector Dirichlet) std::cout << GridLogMessage << "Drawing gauge field" << std::endl; LatticeGaugeFieldF Umu(UGrid); + LatticeGaugeFieldF UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); + UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; //////////////////////////////////// @@ -205,7 +176,8 @@ void Benchmark(int Ls, std::vector Dirichlet) Coordinate Block(4); for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; - std::cout << GridLogMessage << "Applying BCs for Dirichlet Block " << Block << std::endl; + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl; + std::cout << GridLogMessage << "Applying BCs for 
Dirichlet Block4 " << Block << std::endl; DirichletFilter Filter(Block); Filter.applyFilter(Umu); @@ -279,6 +251,7 @@ void Benchmark(int Ls, std::vector Dirichlet) DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); Dw.DirichletBlock(Dirichlet); + Dw.ImportGauge(Umu); int ncall =300; @@ -377,9 +350,6 @@ void Benchmark(int Ls, std::vector Dirichlet) std::cout<1.0e-4) { - std::cout << err << std::endl; - } assert((norm2(err)<1.0e-4)); LatticeFermionF src_e (FrbGrid); From 5340e50427c088a29d60557714e01a7cabf9fa19 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Mar 2022 17:10:25 -0500 Subject: [PATCH 008/240] HMC running with new formulation --- Grid/lattice/Lattice.h | 1 + Grid/lattice/Lattice_crc.h | 55 +++++++++++++ Grid/log/Log.cc | 5 +- Grid/log/Log.h | 1 + Grid/qcd/action/ActionBase.h | 23 ++++++ Grid/qcd/action/ActionCore.h | 1 + Grid/qcd/hmc/GenericHMCrunner.h | 35 ++++++--- Grid/qcd/hmc/HMC.h | 106 ++++++++++++++++++-------- Grid/qcd/hmc/HMCResourceManager.h | 22 +++++- Grid/qcd/hmc/integrators/Integrator.h | 104 +++++++++++++++++++++++-- Grid/stencil/Stencil.h | 12 ++- Grid/tensors/Tensor_exp.h | 2 +- Grid/threads/Accelerator.h | 4 +- benchmarks/Benchmark_dwf_fp32.cc | 2 +- systems/Spock/config-command | 2 + systems/Spock/dwf8.slurm | 12 ++- systems/Spock/sourceme.sh | 4 + 17 files changed, 324 insertions(+), 67 deletions(-) create mode 100644 Grid/lattice/Lattice_crc.h diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 9f5f1da7..c4adf86a 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -46,3 +46,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h new file mode 100644 index 00000000..142e2349 --- /dev/null +++ b/Grid/lattice/Lattice_crc.h @@ -0,0 +1,55 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: 
./lib/lattice/Lattice_crc.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1) +{ + auto ff = localNorm2(f); + if ( mu==-1 ) mu = f.Grid()->Nd()-1; + typedef typename vobj::tensor_reduced normtype; + typedef typename normtype::scalar_object scalar; + std::vector sff; + sliceSum(ff,sff,mu); + for(int t=0;t uint32_t crc(Lattice & buf) +{ + autoView( buf_v , buf, CpuRead); + return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); +} + +#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< &logstreams) { GridLogError.Active(0); @@ -79,6 +80,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogPerformance.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); + GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { if (logstreams[i] == std::string("Error")) GridLogError.Active(1); @@ -87,7 +89,8 @@ void GridLogConfigure(std::vector &logstreams) { if (logstreams[i] == std::string("Iterative")) 
GridLogIterative.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); - if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); + if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0); + if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); } } diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 68693647..b1696fee 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -182,6 +182,7 @@ extern GridLogger GridLogDebug ; extern GridLogger GridLogPerformance; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogHMC; extern Colours GridLogColours; std::string demangle(const char* name) ; diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 17980ee0..fa69b4e5 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -40,6 +40,29 @@ class Action public: bool is_smeared = false; + RealD deriv_norm_sum; + RealD deriv_max_sum; + int deriv_num; + RealD deriv_us; + RealD S_us; + RealD refresh_us; + void reset_timer(void) { + deriv_us = S_us = refresh_us = 0.0; + deriv_num=0; + deriv_norm_sum = deriv_max_sum=0.0; + } + void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;} + RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; + RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; + RealD deriv_timer(void) { return deriv_us; }; + RealD S_timer(void) { return deriv_us; }; + RealD refresh_timer(void) { return deriv_us; }; + void deriv_timer_start(void) { deriv_us-=usecond(); } + void deriv_timer_stop(void) { deriv_us+=usecond(); } + void refresh_timer_start(void) { refresh_us-=usecond(); } + void refresh_timer_stop(void) { refresh_us+=usecond(); } + void S_timer_start(void) { S_us-=usecond(); } + 
void S_timer_stop(void) { S_us+=usecond(); } // Heatbath? virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual RealD S(const GaugeField& U) = 0; // evaluate the action diff --git a/Grid/qcd/action/ActionCore.h b/Grid/qcd/action/ActionCore.h index 0ea18d31..eb77236a 100644 --- a/Grid/qcd/action/ActionCore.h +++ b/Grid/qcd/action/ActionCore.h @@ -39,6 +39,7 @@ NAMESPACE_CHECK(ActionParams); #include #include +#include //////////////////////////////////////////// // Gauge Actions diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index 98e8175a..098f8f22 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -129,18 +129,10 @@ public: Runner(S); } - ////////////////////////////////////////////////////////////////// - -private: - template - void Runner(SmearingPolicy &Smearing) { - auto UGrid = Resources.GetCartesian(); - Resources.AddRNGs(); - Field U(UGrid); - - // Can move this outside? - typedef IntegratorType TheIntegrator; - TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing); + //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U. + //This is called automatically by Run but may be useful elsewhere, e.g. 
for integrator tuning experiments + void initializeGaugeFieldAndRNGs(Field &U){ + if(!Resources.haveRNGs()) Resources.AddRNGs(); if (Parameters.StartingType == "HotStart") { // Hot start @@ -167,6 +159,25 @@ private: << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; exit(1); } + } + + + + ////////////////////////////////////////////////////////////////// + +private: + template + void Runner(SmearingPolicy &Smearing) { + auto UGrid = Resources.GetCartesian(); + Field U(UGrid); + + initializeGaugeFieldAndRNGs(U); + + typedef IntegratorType TheIntegrator; + TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing); + + // Sets the momentum filter + MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter())); Smearing.set_Field(U); diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 44674ea5..86796f05 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -34,6 +34,7 @@ directory * @brief Classes for Hybrid Monte Carlo update * * @author Guido Cossu + * @author Peter Boyle */ //-------------------------------------------------------------------- #pragma once @@ -115,22 +116,17 @@ private: random(sRNG, rn_test); - std::cout << GridLogMessage - << "--------------------------------------------------\n"; - std::cout << GridLogMessage << "exp(-dH) = " << prob - << " Random = " << rn_test << "\n"; - std::cout << GridLogMessage - << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n"; + std::cout << GridLogHMC << "--------------------------------------------------\n"; + std::cout << GridLogHMC << "exp(-dH) = " << prob << " Random = " << rn_test << "\n"; + std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? 
prob : 1.0) << "\n"; if ((prob > 1.0) || (rn_test <= prob)) { // accepted - std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n"; - std::cout << GridLogMessage - << "--------------------------------------------------\n"; + std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n"; + std::cout << GridLogHMC << "--------------------------------------------------\n"; return true; } else { // rejected - std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n"; - std::cout << GridLogMessage - << "--------------------------------------------------\n"; + std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n"; + std::cout << GridLogHMC << "--------------------------------------------------\n"; return false; } } @@ -139,19 +135,68 @@ private: // Evolution ///////////////////////////////////////////////////////// RealD evolve_hmc_step(Field &U) { - TheIntegrator.refresh(U, sRNG, pRNG); // set U and initialize P and phi's - RealD H0 = TheIntegrator.S(U); // initial state action + GridBase *Grid = U.Grid(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Mainly for DDHMC perform a random translation of U modulo volume + ////////////////////////////////////////////////////////////////////////////////////////////////////// + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << "Random shifting gauge field by ["; + for(int d=0;dNd();d++) { + + int L = Grid->GlobalDimensions()[d]; + + RealD rn_uniform; random(sRNG, rn_uniform); + + int shift = (int) (rn_uniform*L); + + std::cout << shift; + if(dNd()-1) std::cout <<","; + else std::cout <<"]\n"; + + U = Cshift(U,d,shift); + } + std::cout << GridLogMessage << "--------------------------------------------------\n"; + + TheIntegrator.reset_timer(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // set U and initialize P and 
phi's + ////////////////////////////////////////////////////////////////////////////////////////////////////// + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << "Refresh momenta and pseudofermions"; + TheIntegrator.refresh(U, sRNG, pRNG); + std::cout << GridLogMessage << "--------------------------------------------------\n"; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // initial state action + ////////////////////////////////////////////////////////////////////////////////////////////////////// + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << "Compute initial action"; + RealD H0 = TheIntegrator.S(U); + std::cout << GridLogMessage << "--------------------------------------------------\n"; std::streamsize current_precision = std::cout.precision(); std::cout.precision(15); - std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n"; + std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n"; std::cout.precision(current_precision); + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << " Molecular Dynamics evolution "; TheIntegrator.integrate(U); + std::cout << GridLogMessage << "--------------------------------------------------\n"; - RealD H1 = TheIntegrator.S(U); // updated state action + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // updated state action + ////////////////////////////////////////////////////////////////////////////////////////////////////// + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << "Compute final action"; + RealD H1 = TheIntegrator.S(U); + std::cout << GridLogMessage << 
"--------------------------------------------------\n"; + + /////////////////////////////////////////////////////////// if(0){ std::cout << "------------------------- Reversibility test" << std::endl; @@ -163,17 +208,16 @@ private: } /////////////////////////////////////////////////////////// - std::cout.precision(15); - std::cout << GridLogMessage << "Total H after trajectory = " << H1 - << " dH = " << H1 - H0 << "\n"; + + std::cout << GridLogHMC << "--------------------------------------------------\n"; + std::cout << GridLogHMC << "Total H after trajectory = " << H1 << " dH = " << H1 - H0 << "\n"; + std::cout << GridLogHMC << "--------------------------------------------------\n"; + std::cout.precision(current_precision); return (H1 - H0); } - - - public: ///////////////////////////////////////// @@ -195,10 +239,13 @@ public: // Actual updates (evolve a copy Ucopy then copy back eventually) unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory; + for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) { - std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n"; + + std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n"; + if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) { - std::cout << GridLogMessage << "-- Thermalization" << std::endl; + std::cout << GridLogHMC << "-- Thermalization" << std::endl; } double t0=usecond(); @@ -207,20 +254,19 @@ public: DeltaH = evolve_hmc_step(Ucopy); // Metropolis-Hastings test bool accept = true; - if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) { + if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) { accept = metropolis_test(DeltaH); } else { - std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl; + std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl; } if (accept) Ucur = Ucopy; - - double t1=usecond(); - std::cout << GridLogMessage << "Total 
time for trajectory (s): " << (t1-t0)/1e6 << std::endl; + std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl; + TheIntegrator.print_timer(); for (int obs = 0; obs < Observables.size(); obs++) { std::cout << GridLogDebug << "Observables # " << obs << std::endl; @@ -228,7 +274,7 @@ public: std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl; Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG); } - std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl; + std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl; } } diff --git a/Grid/qcd/hmc/HMCResourceManager.h b/Grid/qcd/hmc/HMCResourceManager.h index 783e4890..19bee923 100644 --- a/Grid/qcd/hmc/HMCResourceManager.h +++ b/Grid/qcd/hmc/HMCResourceManager.h @@ -72,6 +72,8 @@ class HMCResourceManager { typedef HMCModuleBase< BaseHmcCheckpointer > CheckpointerBaseModule; typedef HMCModuleBase< HmcObservable > ObservableBaseModule; typedef ActionModuleBase< Action, GridModule > ActionBaseModule; + typedef typename ImplementationPolicy::Field MomentaField; + typedef typename ImplementationPolicy::Field Field; // Named storage for grid pairs (std + red-black) std::unordered_map Grids; @@ -80,6 +82,9 @@ class HMCResourceManager { // SmearingModule Smearing; std::unique_ptr CP; + // Momentum filter + std::unique_ptr > Filter; + // A vector of HmcObservable modules std::vector > ObservablesList; @@ -90,6 +95,7 @@ class HMCResourceManager { bool have_RNG; bool have_CheckPointer; + bool have_Filter; // NOTE: operator << is not overloaded for std::vector // so this function is necessary @@ -101,7 +107,7 @@ class HMCResourceManager { public: - HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {} + HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {} template void initialize(ReaderClass &Read){ @@ -129,6 +135,7 @@ public: 
RNGModuleParameters RNGpar(Read); SetRNGSeeds(RNGpar); + // Observables auto &ObsFactory = HMC_ObservablesModuleFactory::getInstance(); Read.push(observable_string);// here must check if existing... @@ -208,6 +215,16 @@ public: AddGrid(s, Mod); } + void SetMomentumFilter( MomentumFilterBase * MomFilter) { + assert(have_Filter==false); + Filter = std::unique_ptr >(MomFilter); + have_Filter = true; + } + MomentumFilterBase *GetMomentumFilter(void) { + if ( !have_Filter) + SetMomentumFilter(new MomentumFilterNone()); + return Filter.get(); + } GridCartesian* GetCartesian(std::string s = "") { if (s.empty()) s = Grids.begin()->first; @@ -226,6 +243,9 @@ public: ////////////////////////////////////////////////////// // Random number generators ////////////////////////////////////////////////////// + + //Return true if the RNG objects have been instantiated + bool haveRNGs() const{ return have_RNG; } void AddRNGs(std::string s = "") { // Couple the RNGs to the GridModule tagged by s diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 6b0e3caf..1985caf0 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -66,6 +66,7 @@ public: template class Integrator { protected: + typedef typename FieldImplementation::Field MomentaField; //for readability typedef typename FieldImplementation::Field Field; @@ -118,36 +119,58 @@ protected: } } update_P_hireps{}; + void update_P(MomentaField& Mom, Field& U, int level, double ep) { // input U actually not used in the fundamental case // Fundamental updates, include smearing for (int a = 0; a < as[level].actions.size(); ++a) { + double start_full = usecond(); Field force(U.Grid()); conformable(U.Grid(), Mom.Grid()); Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); double start_force = usecond(); + as[level].actions.at(a)->deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta + 
as[level].actions.at(a)->deriv_timer_stop(); std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; + auto name = as[level].actions.at(a)->action_name(); if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); + force = FieldImplementation::projectForce(force); // Ta for gauge fields double end_force = usecond(); - Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); - std::cout << GridLogIntegrator << "["<applyFilter(force); + std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) + Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; + + Real force_max = std::sqrt(maxLocalNorm2(force)); + Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR; + + as[level].actions.at(a)->deriv_log(force_abs,force_max); + + std::cout << GridLogIntegrator<< "["<applyFilter(Mom); } void update_U(Field& U, double ep) @@ -161,8 +184,12 @@ protected: void update_U(MomentaField& Mom, Field& U, double ep) { + MomentaField MomFiltered(Mom.Grid()); + MomFiltered = Mom; + MomFilter->applyFilter(MomFiltered); + // exponential of Mom*U in the gauge fields case - FieldImplementation::update_field(Mom, U, ep); + FieldImplementation::update_field(MomFiltered, U, ep); // Update the smeared fields, can be implemented as observer Smearer.set_Field(U); @@ -205,6 +232,66 @@ public: const MomentaField & getMomentum() const{ return P; } + void reset_timer(void) + { + for (int level = 0; level < as.size(); ++level) { + for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { + as[level].actions.at(actionID)->reset_timer(); + } + } + } + void print_timer(void) + { + std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::" << std::endl; + std::cout << GridLogMessage << " Refresh cumulative timings "<action_name() + <<"["<refresh_us*1.0e-6<<" s"<< std::endl; + } + } + std::cout << 
GridLogMessage << "--------------------------- "<action_name() + <<"["<S_us*1.0e-6<<" s"<< std::endl; + } + } + std::cout << GridLogMessage << "--------------------------- "<action_name() + <<"["<deriv_us*1.0e-6<<" s"<< std::endl; + } + } + std::cout << GridLogMessage << "--------------------------- "<action_name() + <<"["<deriv_max_average() + <<" norm " << as[level].actions.at(actionID)->deriv_norm_average() + <<" calls " << as[level].actions.at(actionID)->deriv_num + << std::endl; + } + } + std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; + } + void print_parameters() { std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl; @@ -223,7 +310,6 @@ public: } } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; - } void reverse_momenta() @@ -266,15 +352,19 @@ public: for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID + auto name = as[level].actions.at(actionID)->action_name(); + std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<is_smeared); + as[level].actions.at(actionID)->refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); + as[level].actions.at(actionID)->refresh_timer_stop(); } // Refresh the higher representation actions as[level].apply(refresh_hireps, Representations, sRNG, pRNG); } - MomFilter->applyFilter(P); } // to be used by the actionlevel class to iterate @@ -309,7 +399,9 @@ public: // based on the boolean is_smeared in actionID Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; + as[level].actions.at(actionID)->S_timer_start(); Hterm = as[level].actions.at(actionID)->S(Us); + as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S 
[" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index e2430234..3e356a1c 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1120,7 +1120,7 @@ public: bytes); } - if ( compress.DecompressionStep() && comms_recv ) { + if ( compress.DecompressionStep() ) { AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], words,Decompressions); @@ -1206,8 +1206,8 @@ public: face_table[face_idx].size()*sizeof(face_table_host[0])); } - if ( comms_send ) - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + // if ( comms_send ) + Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; //spointers[0] -- low @@ -1239,7 +1239,7 @@ public: rpointers[i] = rp; int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask); - if ( (!duplicate) ) { // Force comms for now + if ( !duplicate ) { AddPacket((void *)sp,(void *)rp, xmit_to_rank,comms_send, recv_from_rank,comms_recv, @@ -1253,9 +1253,7 @@ public: } } - if ( comms_recv ) { - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); - } + AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); u_comm_offset +=buffer_size; } diff --git a/Grid/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h index 1f637d5f..e35467d4 100644 --- a/Grid/tensors/Tensor_exp.h +++ b/Grid/tensors/Tensor_exp.h @@ -55,7 +55,7 @@ template accelerator_inline iVector Exponentiate(c // Specialisation: Cayley-Hamilton exponential for SU(3) -#ifndef GRID_CUDA +#ifndef GRID_ACCELERATED template::TensorLevel == 0>::type * =nullptr> accelerator_inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) { diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 
b427b304..517d3d3d 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -441,7 +441,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch { - hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream); + hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice); } inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; @@ -461,6 +461,8 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \ accelerator_barrier(dummy); +#define GRID_ACCELERATED + #endif ////////////////////////////////////////////// diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index e5cc2e63..f79797fa 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -143,7 +143,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); LatticeFermionF src (FGrid); random(RNG5,src); -#if 1 +#if 0 src = Zero(); { Coordinate origin({0,0,0,latt4[2]-1,0}); diff --git a/systems/Spock/config-command b/systems/Spock/config-command index 70c97c37..3ffefe4f 100644 --- a/systems/Spock/config-command +++ b/systems/Spock/config-command @@ -6,6 +6,8 @@ --enable-simd=GPU \ --disable-fermion-reps \ --disable-gparity \ +--with-gmp=$OLCF_GMP_ROOT \ +--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ --prefix=/ccs/home/chulwoo/Grid \ diff --git a/systems/Spock/dwf8.slurm b/systems/Spock/dwf8.slurm index c4672db0..f2e12b97 100644 --- a/systems/Spock/dwf8.slurm +++ b/systems/Spock/dwf8.slurm @@ -1,8 +1,7 @@ #!/bin/bash # Begin LSF Directives #SBATCH -A LGT104 -#SBATCH -t 01:00:00 -##SBATCH -U openmpThu +#SBATCH -t 3:00:00 
#SBATCH -p ecp #SBATCH -J DWF #SBATCH -o DWF.%J @@ -14,13 +13,12 @@ DIR=. module list export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -export MPICH_SMP_SINGLE_COPY_MODE=NONE -#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_SMP_SINGLE_COPY_MODE=CMA + export OMP_NUM_THREADS=8 AT=8 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads ${AT} --grid 16.16.16.48 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" +srun -N2 -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./HMC/Mobius2p1f_DD_RHMC $PARAMS diff --git a/systems/Spock/sourceme.sh b/systems/Spock/sourceme.sh index 72a2ff4e..415341d0 100644 --- a/systems/Spock/sourceme.sh +++ b/systems/Spock/sourceme.sh @@ -1,5 +1,9 @@ +module load emacs module load PrgEnv-gnu module load rocm/4.5.0 module load gmp module load cray-fftw module load craype-accel-amd-gfx908 +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH=/opt/cray/pe/gcc/mpfr/3.1.4/lib/:$LD_LIBRARY_PATH From b0f4eee78b0c3d6a99aa820b10c31f18619fee3e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Mar 2022 19:09:13 -0500 Subject: [PATCH 009/240] New files --- Grid/qcd/action/filters/DDHMCFilter.h | 91 ++++++++++ HMC/Mobius2p1f_DD_RHMC.cc | 232 ++++++++++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 Grid/qcd/action/filters/DDHMCFilter.h create mode 100644 HMC/Mobius2p1f_DD_RHMC.cc diff --git a/Grid/qcd/action/filters/DDHMCFilter.h b/Grid/qcd/action/filters/DDHMCFilter.h new file mode 100644 index 00000000..294a8d23 --- /dev/null +++ b/Grid/qcd/action/filters/DDHMCFilter.h @@ -0,0 +1,91 @@ 
+/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h + +Copyright (C) 2015 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//-------------------------------------------------------------------- +#pragma once + +NAMESPACE_BEGIN(Grid); +//////////////////////////////////////////////////// +// DDHMC filter with sub-block size B[mu] +//////////////////////////////////////////////////// + +template +struct DDHMCFilter: public MomentumFilterBase +{ + Coordinate Block; + int Width; + + DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; } + + void applyFilter(GaugeField &U) const override + { + GridBase *grid = U.Grid(); + Coordinate Global=grid->GlobalDimensions(); + GaugeField zzz(grid); zzz = Zero(); + LatticeInteger coor(grid); + + auto zzz_mu = PeekIndex(zzz,0); + //////////////////////////////////////////////////// + // Zero BDY layers + //////////////////////////////////////////////////// + std::cout<(U,mu); + U_mu = 
where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu); + PokeIndex(U, U_mu, mu); + } + if ( Width==2) { + U = where(mod(coor,B1)==Integer(B1-2),zzz,U); + U = where(mod(coor,B1)==Integer(B1-1),zzz,U); + U = where(mod(coor,B1)==Integer(0) ,zzz,U); + U = where(mod(coor,B1)==Integer(1) ,zzz,U); + auto U_mu = PeekIndex(U,mu); + U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu); + PokeIndex(U, U_mu, mu); + } + } + + } + + } +}; + +NAMESPACE_END(Grid); + diff --git a/HMC/Mobius2p1f_DD_RHMC.cc b/HMC/Mobius2p1f_DD_RHMC.cc new file mode 100644 index 00000000..eb9ea9cb --- /dev/null +++ b/HMC/Mobius2p1f_DD_RHMC.cc @@ -0,0 +1,232 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionR FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 10; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 0; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + 
////////////////////////////////////////////// + + const int Ls = 16; + Real beta = 2.13; + Real light_mass = 0.01; + Real strange_mass = 0.04; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; + RealD c = 0.0; + + // FIXME: + // Same in MC and MD + // Need to mix precision too + OneFlavourRationalParams OFRp; + OFRp.lo = 4.0e-3; + OFRp.hi = 30.0; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 16; + OFRp.precision= 50; + + std::vector hasenbusch({ 0.01, 0.04, 0.2 , pv_mass }); + std::vector dirichlet ({ true, true, true }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4)); + + ////////////////////////// + // Fermion Grid + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(4); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp); + // Level1.push_back(&StrangePseudoFermion); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + dirichlet_den.push_back(0); + 
for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + if ( dirichlet_den[h]==1) Denominators[h]->DirichletBlock(Dirichlet); + if ( dirichlet_num[h]==1) Numerators[h]->DirichletBlock(Dirichlet); + } + + int nquo=Quotients.size(); + Level1.push_back(Quotients[0]); + Level1.push_back(Quotients[nquo-1]); + for(int h=1;h Date: Thu, 3 Mar 2022 16:56:02 -0500 Subject: [PATCH 010/240] small DDHMC update --- Grid/qcd/action/filters/DDHMCFilter.h | 11 +++++++++++ HMC/Mobius2p1f_DD_RHMC.cc | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/action/filters/DDHMCFilter.h b/Grid/qcd/action/filters/DDHMCFilter.h index 294a8d23..366b18e8 100644 --- a/Grid/qcd/action/filters/DDHMCFilter.h +++ b/Grid/qcd/action/filters/DDHMCFilter.h @@ -80,6 +80,17 @@ struct DDHMCFilter: public MomentumFilterBase U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu); PokeIndex(U, U_mu, mu); } + if ( Width==3) { + U = where(mod(coor,B1)==Integer(B1-3),zzz,U); + U = where(mod(coor,B1)==Integer(B1-2),zzz,U); + U = where(mod(coor,B1)==Integer(B1-1),zzz,U); + U = where(mod(coor,B1)==Integer(0) ,zzz,U); + U = where(mod(coor,B1)==Integer(1) ,zzz,U); + U = where(mod(coor,B1)==Integer(2) ,zzz,U); + auto U_mu = PeekIndex(U,mu); + U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); + PokeIndex(U, U_mu, mu); + } } } diff --git a/HMC/Mobius2p1f_DD_RHMC.cc b/HMC/Mobius2p1f_DD_RHMC.cc index eb9ea9cb..a41972c2 100644 --- a/HMC/Mobius2p1f_DD_RHMC.cc +++ b/HMC/Mobius2p1f_DD_RHMC.cc @@ -50,16 +50,16 @@ int main(int argc, char **argv) { // MD.name = std::string("Force Gradient"); typedef GenericHMCRunner HMCWrapper; MD.name = std::string("MinimumNorm2"); - MD.MDsteps = 10; + MD.MDsteps = 4; MD.trajL = 1.0; HMCparameters HMCparams; - HMCparams.StartTrajectory = 0; + HMCparams.StartTrajectory = 8; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, 
CheckpointStart]\n"; - HMCparams.StartingType =std::string("ColdStart"); - // HMCparams.StartingType =std::string("CheckpointStart"); + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); @@ -159,7 +159,7 @@ int main(int argc, char **argv) { //////////////////////////////////// ActionLevel Level1(1); ActionLevel Level2(2); - ActionLevel Level3(4); + ActionLevel Level3(8); //////////////////////////////////// // Strange action From bb5c16b97f1c40b7c131eae08ef58f989e2c07a4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Mar 2022 17:00:37 -0500 Subject: [PATCH 011/240] New scripts --- systems/Crusher/dwf.slurm | 20 ++++++++++---------- systems/Crusher/dwf8.slurm | 33 +++++++++++++++++++++++++++------ systems/Crusher/mpiwrapper.sh | 5 +++-- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/systems/Crusher/dwf.slurm b/systems/Crusher/dwf.slurm index 286615ef..8742ed76 100644 --- a/systems/Crusher/dwf.slurm +++ b/systems/Crusher/dwf.slurm @@ -3,28 +3,28 @@ #SBATCH -A LGT104 #SBATCH -t 01:00:00 ##SBATCH -U openmpThu -##SBATCH -p ecp #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 -#SBATCH -n 1 -#SBATCH --exclusive +#SBATCH -n 8 +#SBATCH --exclusive +#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 DIR=. 
module list -#export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -#export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export MPICH_SMP_SINGLE_COPY_MODE=NONE #export MPICH_SMP_SINGLE_COPY_MODE=CMA export OMP_NUM_THREADS=1 -AT=8 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1" - -srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads 16 --grid 32.32.32.256 --mpi 1.1.1.8 --comms-overlap --shm 2048 --shm-mpi 0" +echo $PARAMS +srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm index 30e83fff..64572142 100644 --- a/systems/Crusher/dwf8.slurm +++ b/systems/Crusher/dwf8.slurm @@ -6,22 +6,43 @@ #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J -#SBATCH -N 1 -#SBATCH -n 8 +#SBATCH -N 8 +#SBATCH -n 64 #SBATCH --exclusive +#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 DIR=. 
module list +export MPICH_OFI_NIC_POLICY=GPU export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -#export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM #export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_SMP_SINGLE_COPY_MODE=NONE export OMP_NUM_THREADS=1 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads 16 --grid 64.64.64.256 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0" +echo $PARAMS +#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.256.8node + + +PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1" +echo $PARAMS +srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node + +PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0" +echo $PARAMS +#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node.shm0 + +PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1" +echo $PARAMS +#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node + +PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0" +echo $PARAMS +#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0 + diff --git a/systems/Crusher/mpiwrapper.sh b/systems/Crusher/mpiwrapper.sh index 76c4e364..f6a56698 100755 --- a/systems/Crusher/mpiwrapper.sh +++ b/systems/Crusher/mpiwrapper.sh @@ -1,10 +1,11 @@ #!/bin/bash lrank=$SLURM_LOCALID +lgpu=(0 1 2 3 7 6 5 4) -export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID +export 
ROCR_VISIBLE_DEVICES=${lgpu[$lrank]} -echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING" +echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES " $* From 387397374a346a623e65e64ac42d9ee7cab28da6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Mar 2022 16:35:11 -0400 Subject: [PATCH 012/240] Current run options --- systems/Crusher/config-command | 2 ++ systems/Crusher/sourceme.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 90737808..34123e6e 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -5,6 +5,8 @@ --enable-gen-simd-width=64 \ --enable-simd=GPU \ --disable-fermion-reps \ +--with-gmp=$OLCF_GMP_ROOT \ +--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --disable-gparity \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 3f400ca4..83bfe57c 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -3,3 +3,4 @@ module load rocm/4.5.0 module load gmp module load cray-fftw module load craype-accel-amd-gfx90a +export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH From 83f818a99d75ed728502d9f5f701852bb3a4be84 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Apr 2022 16:24:34 -0400 Subject: [PATCH 013/240] Updates for DDHMC --- Grid/qcd/action/ActionParams.h | 5 +- Grid/qcd/action/fermion/CayleyFermion5D.h | 2 +- Grid/qcd/action/pseudofermion/Bounds.h | 25 ++ ...osedBoundaryTwoFlavourBosonPseudoFermion.h | 163 ++++++++++++ ...ecomposedBoundaryTwoFlavourPseudoFermion.h | 158 ++++++++++++ ...osedBoundaryTwoFlavourRatioPseudoFermion.h | 237 ++++++++++++++++++ .../OneFlavourEvenOddRationalRatio.h | 12 +- .../pseudofermion/OneFlavourRationalRatio.h | 20 +- .../pseudofermion/TwoFlavourEvenOddRatio.h | 8 +- .../TwoFlavourRatioEO4DPseudoFermion.h | 203 +++++++++++++++ 
Grid/qcd/hmc/integrators/Integrator.h | 1 + HMC/Mobius2p1f_DD_RHMC.cc | 95 ++++--- systems/Crusher/comms.slurm | 26 ++ 13 files changed, 911 insertions(+), 44 deletions(-) create mode 100644 Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h create mode 100644 Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h create mode 100644 Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h create mode 100644 Grid/qcd/action/pseudofermion/TwoFlavourRatioEO4DPseudoFermion.h create mode 100644 systems/Crusher/comms.slurm diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index 0e6a11c6..6a3f053a 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -63,6 +63,7 @@ struct StaggeredImplParams { RealD, hi, int, MaxIter, RealD, tolerance, + RealD, mdtolerance, int, degree, int, precision, int, BoundsCheckFreq); @@ -76,11 +77,13 @@ struct StaggeredImplParams { RealD tol = 1.0e-8, int _degree = 10, int _precision = 64, - int _BoundsCheckFreq=20) + int _BoundsCheckFreq=20, + RealD mdtol = 1.0e-6) : lo(_lo), hi(_hi), MaxIter(_maxit), tolerance(tol), + mdtolerance(mdtol), degree(_degree), precision(_precision), BoundsCheckFreq(_BoundsCheckFreq){}; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index c7d68d73..489b51ff 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -68,7 +68,7 @@ public: /////////////////////////////////////////////////////////////// // Support for MADWF tricks /////////////////////////////////////////////////////////////// - RealD Mass(void) { return mass; }; + virtual RealD Mass(void) { return mass; }; void SetMass(RealD _mass) { mass=_mass; SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs diff --git a/Grid/qcd/action/pseudofermion/Bounds.h b/Grid/qcd/action/pseudofermion/Bounds.h index 
535e1a49..b9621f24 100644 --- a/Grid/qcd/action/pseudofermion/Bounds.h +++ b/Grid/qcd/action/pseudofermion/Bounds.h @@ -13,6 +13,31 @@ NAMESPACE_BEGIN(Grid); std::cout << GridLogMessage << "Pseudofermion action lamda_max "< void ChebyBoundsCheck(LinearOperatorBase &HermOp, + Field &GaussNoise, + RealD lo,RealD hi) + { + int orderfilter = 1000; + Chebyshev Cheb(lo,hi,orderfilter); + + GridBase *FermionGrid = GaussNoise.Grid(); + + Field X(FermionGrid); + Field Z(FermionGrid); + + X=GaussNoise; + RealD Nx = norm2(X); + Cheb(HermOp,X,Z); + RealD Nz = norm2(Z); + + std::cout << "************************* "< void InverseSqrtBoundsCheck(int MaxIter,double tol, LinearOperatorBase &HermOp, diff --git a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h new file mode 100644 index 00000000..d00ae894 --- /dev/null +++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion.h @@ -0,0 +1,163 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundaryBoson.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////// +// Two flavour ratio +/////////////////////////////////////// +template +class DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion : public Action { +public: + INHERIT_IMPL_TYPES(ImplD); + +private: + SchurFactoredFermionOperator & NumOp;// the basic operator + RealD InnerStoppingCondition; + RealD ActionStoppingCondition; + RealD DerivativeStoppingCondition; + FermionField Phi; // the pseudo fermion field for this trajectory +public: + DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion(SchurFactoredFermionOperator &_NumOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6) + : NumOp(_NumOp), + DerivativeStoppingCondition(_DerivativeTol), + ActionStoppingCondition(_ActionTol), + InnerStoppingCondition(_InnerTol), + Phi(_NumOp.FermionGrid()) {}; + + virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourBosonPseudoFermion";} + + virtual std::string LogParameters(){ + std::stringstream sstream; + return sstream.str(); + } + + virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG) + { + // P(phi) = e^{- phi^dag P^dag P phi} + // + // NumOp == P + // + // Take phi = P^{-1} eta ; eta = P Phi + // + // P(eta) = e^{- eta^dag eta} + // + // e^{x^2/2 sig^2} => sig^2 = 0.5. + // + // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707.... 
+ // + RealD scale = std::sqrt(0.5); + + NumOp.tolinner=InnerStoppingCondition; + NumOp.tol=ActionStoppingCondition; + NumOp.ImportGauge(U); + + FermionField eta(NumOp.FermionGrid()); + + gaussian(pRNG,eta); eta=eta*scale; + + NumOp.ProjectBoundaryBar(eta); + //DumpSliceNorm("eta",eta); + NumOp.RInv(eta,Phi); + + //DumpSliceNorm("Phi",Phi); + + }; + + ////////////////////////////////////////////////////// + // S = phi^dag Pdag P phi + ////////////////////////////////////////////////////// + virtual RealD S(const GaugeField &U) { + + NumOp.tolinner=InnerStoppingCondition; + NumOp.tol=ActionStoppingCondition; + NumOp.ImportGauge(U); + + FermionField Y(NumOp.FermionGrid()); + + NumOp.R(Phi,Y); + + RealD action = norm2(Y); + + return action; + }; + + virtual void deriv(const GaugeField &U,GaugeField & dSdU) + { + NumOp.tolinner=InnerStoppingCondition; + NumOp.tol=DerivativeStoppingCondition; + NumOp.ImportGauge(U); + + GridBase *fgrid = NumOp.FermionGrid(); + GridBase *ugrid = NumOp.GaugeGrid(); + + FermionField X(fgrid); + FermionField Y(fgrid); + FermionField tmp(fgrid); + + GaugeField force(ugrid); + + FermionField DobiDdbPhi(fgrid); // Vector A in my notes + FermionField DoiDdDobiDdbPhi(fgrid); // Vector B in my notes + FermionField DoidP_Phi(fgrid); // Vector E in my notes + FermionField DobidDddDoidP_Phi(fgrid); // Vector F in my notes + + FermionField P_Phi(fgrid); + + // P term + NumOp.dBoundaryBar(Phi,tmp); + NumOp.dOmegaBarInv(tmp,DobiDdbPhi); // Vector A + NumOp.dBoundary(DobiDdbPhi,tmp); + NumOp.dOmegaInv(tmp,DoiDdDobiDdbPhi); // Vector B + P_Phi = Phi - DoiDdDobiDdbPhi; + NumOp.ProjectBoundaryBar(P_Phi); + + // P^dag P term + NumOp.dOmegaDagInv(P_Phi,DoidP_Phi); // Vector E + NumOp.dBoundaryDag(DoidP_Phi,tmp); + NumOp.dOmegaBarDagInv(tmp,DobidDddDoidP_Phi); // Vector F + NumOp.dBoundaryBarDag(DobidDddDoidP_Phi,tmp); + + X = DobiDdbPhi; + Y = DobidDddDoidP_Phi; + NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo); dSdU=force; + 
NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes); dSdU=dSdU+force; + + X = DoiDdDobiDdbPhi; + Y = DoidP_Phi; + NumOp.DirichletFermOpD.MDeriv(force,Y,X,DaggerNo); dSdU=dSdU+force; + NumOp.DirichletFermOpD.MDeriv(force,X,Y,DaggerYes); dSdU=dSdU+force; + + dSdU *= -1.0; + + }; +}; + +NAMESPACE_END(Grid); + diff --git a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h new file mode 100644 index 00000000..1f3687ca --- /dev/null +++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourPseudoFermion.h @@ -0,0 +1,158 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////// +// Two flavour ratio +/////////////////////////////////////// +template +class DomainDecomposedBoundaryTwoFlavourPseudoFermion : public Action { +public: + INHERIT_IMPL_TYPES(ImplD); + +private: + SchurFactoredFermionOperator & DenOp;// the basic operator + RealD ActionStoppingCondition; + RealD DerivativeStoppingCondition; + RealD InnerStoppingCondition; + + FermionField Phi; // the pseudo fermion field for this trajectory + + RealD refresh_action; +public: + DomainDecomposedBoundaryTwoFlavourPseudoFermion(SchurFactoredFermionOperator &_DenOp,RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol = 1.0e-6 ) + : DenOp(_DenOp), + DerivativeStoppingCondition(_DerivativeTol), + ActionStoppingCondition(_ActionTol), + InnerStoppingCondition(_InnerTol), + Phi(_DenOp.FermionGrid()) {}; + + virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourPseudoFermion";} + + + virtual std::string LogParameters(){ + std::stringstream sstream; + return sstream.str(); + } + + virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG) + { + // P(phi) = e^{- phi^dag Rdag^-1 R^-1 phi} + // + // DenOp == R + // + // Take phi = R eta ; eta = R^-1 Phi + // + // P(eta) = e^{- eta^dag eta} + // + // e^{x^2/2 sig^2} => sig^2 = 0.5. + // + // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707.... 
+ // + RealD scale = std::sqrt(0.5); + + DenOp.tolinner=InnerStoppingCondition; + DenOp.tol =ActionStoppingCondition; + DenOp.ImportGauge(U); + + FermionField eta(DenOp.FermionGrid()); + + gaussian(pRNG,eta); eta=eta*scale; + + DenOp.ProjectBoundaryBar(eta); + DenOp.R(eta,Phi); + //DumpSliceNorm("Phi",Phi); + refresh_action = norm2(eta); + }; + + ////////////////////////////////////////////////////// + // S = phi^dag Rdag^-1 R^-1 phi + ////////////////////////////////////////////////////// + virtual RealD S(const GaugeField &U) { + + DenOp.tolinner=InnerStoppingCondition; + DenOp.tol=ActionStoppingCondition; + DenOp.ImportGauge(U); + + FermionField X(DenOp.FermionGrid()); + + DenOp.RInv(Phi,X); + + RealD action = norm2(X); + + return action; + }; + + virtual void deriv(const GaugeField &U,GaugeField & dSdU) + { + DenOp.tolinner=InnerStoppingCondition; + DenOp.tol=DerivativeStoppingCondition; + DenOp.ImportGauge(U); + + GridBase *fgrid = DenOp.FermionGrid(); + GridBase *ugrid = DenOp.GaugeGrid(); + + FermionField X(fgrid); + FermionField Y(fgrid); + FermionField tmp(fgrid); + + GaugeField force(ugrid); + + FermionField DiDdb_Phi(fgrid); // Vector C in my notes + FermionField DidRinv_Phi(fgrid); // Vector D in my notes + FermionField Rinv_Phi(fgrid); + +// FermionField RinvDagRinv_Phi(fgrid); +// FermionField DdbdDidRinv_Phi(fgrid); + + // R^-1 term + DenOp.dBoundaryBar(Phi,tmp); + DenOp.Dinverse(tmp,DiDdb_Phi); // Vector C + Rinv_Phi = Phi - DiDdb_Phi; + DenOp.ProjectBoundaryBar(Rinv_Phi); + + // R^-dagger R^-1 term + DenOp.DinverseDag(Rinv_Phi,DidRinv_Phi); // Vector D +/* + DenOp.dBoundaryBarDag(DidRinv_Phi,DdbdDidRinv_Phi); + RinvDagRinv_Phi = Rinv_Phi - DdbdDidRinv_Phi; + DenOp.ProjectBoundaryBar(RinvDagRinv_Phi); +*/ + X = DiDdb_Phi; + Y = DidRinv_Phi; + DenOp.PeriodicFermOpD.MDeriv(force,Y,X,DaggerNo); dSdU=force; + DenOp.PeriodicFermOpD.MDeriv(force,X,Y,DaggerYes); dSdU=dSdU+force; + DumpSliceNorm("force",dSdU); + dSdU *= -1.0; + }; +}; + 
+NAMESPACE_END(Grid); + diff --git a/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h new file mode 100644 index 00000000..cb9ce0a4 --- /dev/null +++ b/Grid/qcd/action/pseudofermion/DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion.h @@ -0,0 +1,237 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/pseudofermion/DomainDecomposedTwoFlavourBoundary.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////// +// Two flavour ratio +/////////////////////////////////////// +template +class DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion : public Action { +public: + INHERIT_IMPL_TYPES(ImplD); + +private: + SchurFactoredFermionOperator & NumOp;// the basic operator + SchurFactoredFermionOperator & DenOp;// the basic operator + + RealD InnerStoppingCondition; + RealD ActionStoppingCondition; + RealD DerivativeStoppingCondition; + + FermionField Phi; // the pseudo fermion field for this trajectory + +public: + DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion(SchurFactoredFermionOperator &_NumOp, + SchurFactoredFermionOperator &_DenOp, + RealD _DerivativeTol, RealD _ActionTol, RealD _InnerTol=1.0e-6) + : NumOp(_NumOp), DenOp(_DenOp), + Phi(_NumOp.PeriodicFermOpD.FermionGrid()), + InnerStoppingCondition(_InnerTol), + DerivativeStoppingCondition(_DerivativeTol), + ActionStoppingCondition(_ActionTol) + {}; + + virtual std::string action_name(){return "DomainDecomposedBoundaryTwoFlavourRatioPseudoFermion";} + + virtual std::string LogParameters(){ + std::stringstream sstream; + return sstream.str(); + } + + virtual void refresh(const GaugeField &U, GridSerialRNG& sRNG, GridParallelRNG& pRNG) + { + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + + FermionField eta(NumOp.PeriodicFermOpD.FermionGrid()); + FermionField tmp(NumOp.PeriodicFermOpD.FermionGrid()); + + // P(phi) = e^{- phi^dag P^dag Rdag^-1 R^-1 P phi} + // + // NumOp == P + // DenOp == R + // + // Take phi = P^{-1} R eta ; eta = R^-1 P Phi + // + // P(eta) = e^{- eta^dag eta} + // + // e^{x^2/2 sig^2} => sig^2 = 0.5. + // + // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707.... 
+ // + RealD scale = std::sqrt(0.5); + + gaussian(pRNG,eta); eta=eta*scale; + + NumOp.ProjectBoundaryBar(eta); + NumOp.tolinner=InnerStoppingCondition; + DenOp.tolinner=InnerStoppingCondition; + DenOp.tol = ActionStoppingCondition; + NumOp.tol = ActionStoppingCondition; + DenOp.R(eta,tmp); + NumOp.RInv(tmp,Phi); + DumpSliceNorm("Phi",Phi); + + }; + + ////////////////////////////////////////////////////// + // S = phi^dag Pdag Rdag^-1 R^-1 P phi + ////////////////////////////////////////////////////// + virtual RealD S(const GaugeField &U) { + + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + + FermionField X(NumOp.PeriodicFermOpD.FermionGrid()); + FermionField Y(NumOp.PeriodicFermOpD.FermionGrid()); + + NumOp.tolinner=InnerStoppingCondition; + DenOp.tolinner=InnerStoppingCondition; + DenOp.tol = ActionStoppingCondition; + NumOp.tol = ActionStoppingCondition; + NumOp.R(Phi,Y); + DenOp.RInv(Y,X); + + RealD action = norm2(X); + // std::cout << " DD boundary action is " < & DenOp;// the basic operator FermionField PhiEven; // the pseudo fermion field for this trajectory FermionField PhiOdd; // the pseudo fermion field for this trajectory + FermionField Noise; // spare noise field for bounds check public: @@ -70,6 +71,7 @@ NAMESPACE_BEGIN(Grid); DenOp(_DenOp), PhiOdd (_NumOp.FermionRedBlackGrid()), PhiEven(_NumOp.FermionRedBlackGrid()), + Noise(_NumOp.FermionRedBlackGrid()), param(p) { AlgRemez remez(param.lo,param.hi,param.precision); @@ -87,7 +89,11 @@ NAMESPACE_BEGIN(Grid); PowerNegQuarter.Init(remez,param.tolerance,true); }; - virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";} + virtual std::string action_name(){ + std::stringstream sstream; + sstream<< "OneFlavourEvenOddRatioRationalPseudoFermionAction det("<< DenOp.Mass() << ") / det("<Broadcast(0,r); if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(NumOp.FermionRedBlackGrid()); - gauss = PhiOdd; + gauss = Noise; HighBoundCheck(MdagM,gauss,param.hi); 
InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf); + ChebyBoundsCheck(MdagM,Noise,param.lo,param.hi); } // Phidag VdagV^1/4 MdagM^-1/4 MdagM^-1/4 VdagV^1/4 Phi diff --git a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h index 128c869a..f8e2e703 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h @@ -49,10 +49,12 @@ NAMESPACE_BEGIN(Grid); Params param; MultiShiftFunction PowerHalf ; - MultiShiftFunction PowerNegHalf; MultiShiftFunction PowerQuarter; + MultiShiftFunction PowerNegHalf; MultiShiftFunction PowerNegQuarter; + MultiShiftFunction MDPowerQuarter; + MultiShiftFunction MDPowerNegHalf; private: FermionOperator & NumOp;// the basic operator @@ -79,6 +81,10 @@ NAMESPACE_BEGIN(Grid); remez.generateApprox(param.degree,1,4); PowerQuarter.Init(remez,param.tolerance,false); PowerNegQuarter.Init(remez,param.tolerance,true); + + // Derive solves different tol + MDPowerQuarter.Init(remez,param.mdtolerance,false); + MDPowerNegHalf.Init(remez,param.mdtolerance,true); }; virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";} @@ -204,8 +210,8 @@ NAMESPACE_BEGIN(Grid); virtual void deriv(const GaugeField &U,GaugeField & dSdU) { - const int n_f = PowerNegHalf.poles.size(); - const int n_pv = PowerQuarter.poles.size(); + const int n_f = MDPowerNegHalf.poles.size(); + const int n_pv = MDPowerQuarter.poles.size(); std::vector MpvPhi_k (n_pv,NumOp.FermionGrid()); std::vector MpvMfMpvPhi_k(n_pv,NumOp.FermionGrid()); @@ -224,8 +230,8 @@ NAMESPACE_BEGIN(Grid); MdagMLinearOperator ,FermionField> MdagM(DenOp); MdagMLinearOperator ,FermionField> VdagV(NumOp); - ConjugateGradientMultiShift msCG_V(param.MaxIter,PowerQuarter); - ConjugateGradientMultiShift msCG_M(param.MaxIter,PowerNegHalf); + ConjugateGradientMultiShift msCG_V(param.MaxIter,MDPowerQuarter); + 
ConjugateGradientMultiShift msCG_M(param.MaxIter,MDPowerNegHalf); msCG_V(VdagV,Phi,MpvPhi_k,MpvPhi); msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi); @@ -244,7 +250,7 @@ NAMESPACE_BEGIN(Grid); //(1) for(int k=0;k +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +/////////////////////////////////////// +// Two flavour ratio +/////////////////////////////////////// +template +class TwoFlavourRatioEO4DPseudoFermionAction : public Action { +public: + INHERIT_IMPL_TYPES(Impl); + +private: + typedef FermionOperator FermOp; + FermionOperator & NumOp;// the basic operator + FermionOperator & DenOp;// the basic operator + + OperatorFunction &DerivativeSolver; + OperatorFunction &DerivativeDagSolver; + OperatorFunction &ActionSolver; + OperatorFunction &HeatbathSolver; + + FermionField phi4; // the pseudo fermion field for this trajectory + +public: + TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator &_NumOp, + FermionOperator &_DenOp, + OperatorFunction & DS, + OperatorFunction & AS ) : + TwoFlavourRatioEO4DPseudoFermionAction(_NumOp,_DenOp, DS,DS,AS,AS) {}; + 
TwoFlavourRatioEO4DPseudoFermionAction(FermionOperator &_NumOp, + FermionOperator &_DenOp, + OperatorFunction & DS, + OperatorFunction & DDS, + OperatorFunction & AS, + OperatorFunction & HS + ) : NumOp(_NumOp), + DenOp(_DenOp), + DerivativeSolver(DS), + DerivativeDagSolver(DDS), + ActionSolver(AS), + HeatbathSolver(HS), + phi4(_NumOp.GaugeGrid()) + {}; + + virtual std::string action_name(){return "TwoFlavourRatioEO4DPseudoFermionAction";} + + virtual std::string LogParameters(){ + std::stringstream sstream; + sstream << GridLogMessage << "["< sig^2 = 0.5. + // + // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707.... + // + RealD scale = std::sqrt(0.5); + + FermionField eta4(NumOp.GaugeGrid()); + FermionField eta5(NumOp.FermionGrid()); + FermionField tmp(NumOp.FermionGrid()); + FermionField phi5(NumOp.FermionGrid()); + + gaussian(pRNG,eta4); + NumOp.ImportFourDimPseudoFermion(eta4,eta5); + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + + SchurRedBlackDiagMooeeSolve PrecSolve(HeatbathSolver); + + DenOp.M(eta5,tmp); // M eta + PrecSolve(NumOp,tmp,phi5); // phi = V^-1 M eta + phi5=phi5*scale; + std::cout << GridLogMessage << "4d pf refresh "<< norm2(phi5)<<"\n"; + // Project to 4d + NumOp.ExportFourDimPseudoFermion(phi5,phi4); + + }; + + ////////////////////////////////////////////////////// + // S = phi^dag (V^dag M^-dag)_11 (M^-1 V)_11 phi + ////////////////////////////////////////////////////// + virtual RealD S(const GaugeField &U) { + + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + + FermionField Y4(NumOp.GaugeGrid()); + FermionField X(NumOp.FermionGrid()); + FermionField Y(NumOp.FermionGrid()); + FermionField phi5(NumOp.FermionGrid()); + + MdagMLinearOperator ,FermionField> MdagMOp(DenOp); + SchurRedBlackDiagMooeeSolve PrecSolve(ActionSolver); + + NumOp.ImportFourDimPseudoFermion(phi4,phi5); + NumOp.M(phi5,X); // X= V phi + PrecSolve(DenOp,X,Y); // Y= (MdagM)^-1 Mdag Vdag phi = M^-1 V phi + NumOp.ExportFourDimPseudoFermion(Y,Y4); + + 
RealD action = norm2(Y4); + + return action; + }; + + ////////////////////////////////////////////////////// + // dS/du = 2 Re phi^dag (V^dag M^-dag)_11 (M^-1 d V)_11 phi + // - 2 Re phi^dag (dV^dag M^-dag)_11 (M^-1 dM M^-1 V)_11 phi + ////////////////////////////////////////////////////// + virtual void deriv(const GaugeField &U,GaugeField & dSdU) { + + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + + FermionField X(NumOp.FermionGrid()); + FermionField Y(NumOp.FermionGrid()); + FermionField phi(NumOp.FermionGrid()); + FermionField Vphi(NumOp.FermionGrid()); + FermionField MinvVphi(NumOp.FermionGrid()); + FermionField tmp4(NumOp.GaugeGrid()); + FermionField MdagInvMinvVphi(NumOp.FermionGrid()); + + GaugeField force(NumOp.GaugeGrid()); + + //Y=V phi + //X = (Mdag V phi + //Y = (Mdag M)^-1 Mdag V phi = M^-1 V Phi + NumOp.ImportFourDimPseudoFermion(phi4,phi); + NumOp.M(phi,Vphi); // V phi + SchurRedBlackDiagMooeeSolve PrecSolve(DerivativeSolver); + PrecSolve(DenOp,Vphi,MinvVphi);// M^-1 V phi + std::cout << GridLogMessage << "4d deriv solve "<< norm2(MinvVphi)<<"\n"; + + // Projects onto the physical space and back + NumOp.ExportFourDimPseudoFermion(MinvVphi,tmp4); + NumOp.ImportFourDimPseudoFermion(tmp4,Y); + + SchurRedBlackDiagMooeeDagSolve PrecDagSolve(DerivativeDagSolver); + // X = proj M^-dag V phi + // Need an adjoint solve + PrecDagSolve(DenOp,Y,MdagInvMinvVphi); + std::cout << GridLogMessage << "4d deriv solve dag "<< norm2(MdagInvMinvVphi)<<"\n"; + + // phi^dag (Vdag Mdag^-1) (M^-1 dV) phi + NumOp.MDeriv(force ,MdagInvMinvVphi , phi, DaggerNo ); dSdU=force; + + // phi^dag (dVdag Mdag^-1) (M^-1 V) phi + NumOp.MDeriv(force , phi, MdagInvMinvVphi ,DaggerYes ); dSdU=dSdU+force; + + // - 2 Re phi^dag (dV^dag M^-dag)_11 (M^-1 dM M^-1 V)_11 phi + DenOp.MDeriv(force,MdagInvMinvVphi,MinvVphi,DaggerNo); dSdU=dSdU-force; + DenOp.MDeriv(force,MinvVphi,MdagInvMinvVphi,DaggerYes); dSdU=dSdU-force; + + dSdU *= -1.0; + //dSdU = - Ta(dSdU); + + }; +}; + +NAMESPACE_END(Grid); 
+ + diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 1985caf0..6ad0e078 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -143,6 +143,7 @@ protected: force = FieldImplementation::projectForce(force); // Ta for gauge fields double end_force = usecond(); + DumpSliceNorm("force ",force,Nd-1); MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["< hasenbusch({ light_mass, 0.04, 0.25, 0.4, 0.7 , pv_mass }); // FIXME: // Same in MC and MD // Need to mix precision too + OneFlavourRationalParams SFRp; + SFRp.lo = 4.0e-3; + SFRp.hi = 30.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 1.0e-6; + SFRp.degree = 16; + SFRp.precision= 50; + SFRp.BoundsCheckFreq=5; + OneFlavourRationalParams OFRp; - OFRp.lo = 4.0e-3; + OFRp.lo = 1.0e-4; OFRp.hi = 30.0; OFRp.MaxIter = 10000; - OFRp.tolerance= 1.0e-10; + OFRp.tolerance= 1.0e-8; + OFRp.mdtolerance= 1.0e-6; OFRp.degree = 16; OFRp.precision= 50; - - std::vector hasenbusch({ 0.01, 0.04, 0.2 , pv_mass }); - std::vector dirichlet ({ true, true, true }); + OFRp.BoundsCheckFreq=5; auto GridPtr = TheHMC.Resources.GetCartesian(); auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); @@ -133,7 +143,8 @@ int main(int argc, char **argv) { Block4[1] = Dirichlet[2]; Block4[2] = Dirichlet[3]; Block4[3] = Dirichlet[4]; - TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4)); + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); ////////////////////////// // Fermion Grid @@ -150,7 +161,7 @@ int main(int argc, char **argv) { std::vector boundary = {1,1,1,-1}; FermionAction::ImplParams Params(boundary); - double StoppingCondition = 1e-10; + double StoppingCondition = 1e-8; double MaxCGIterations = 30000; ConjugateGradient CG(StoppingCondition,MaxCGIterations); @@ -158,8 +169,8 @@ int main(int argc, char **argv) { // Collect actions 
//////////////////////////////////// ActionLevel Level1(1); - ActionLevel Level2(2); - ActionLevel Level3(8); + ActionLevel Level2(4); + ActionLevel Level3(6); //////////////////////////////////// // Strange action @@ -167,8 +178,17 @@ int main(int argc, char **argv) { FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); - OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp); - // Level1.push_back(&StrangePseudoFermion); + FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + StrangeOpDir.DirichletBlock(Dirichlet); + StrangePauliVillarsOpDir.DirichletBlock(Dirichlet); + + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); + Level1.push_back(&StrangePseudoFermionBdy); + Level2.push_back(&StrangePseudoFermionLocal); + Level1.push_back(&StrangePseudoFermionPVBdy); //////////////////////////////////// // up down action @@ -179,37 +199,49 @@ int main(int argc, char **argv) { std::vector dirichlet_num; int n_hasenbusch = hasenbusch.size(); - light_den.push_back(light_mass); - dirichlet_den.push_back(0); + light_den.push_back(light_mass); dirichlet_den.push_back(0); for(int h=0;h Numerators; std::vector Denominators; std::vector *> Quotients; + std::vector *> Bdys; for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + if(h!=0) { + Quotients.push_back (new 
TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],CG,CG)); + } else { + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } if ( dirichlet_den[h]==1) Denominators[h]->DirichletBlock(Dirichlet); if ( dirichlet_num[h]==1) Numerators[h]->DirichletBlock(Dirichlet); } int nquo=Quotients.size(); - Level1.push_back(Quotients[0]); - Level1.push_back(Quotients[nquo-1]); - for(int h=1;h Date: Tue, 5 Apr 2022 16:25:22 -0400 Subject: [PATCH 014/240] Tone down printing in integrator --- Grid/qcd/hmc/integrators/Integrator.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 6ad0e078..070cbea1 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -143,10 +143,9 @@ protected: force = FieldImplementation::projectForce(force); // Ta for gauge fields double end_force = usecond(); - DumpSliceNorm("force ",force,Nd-1); MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. 
norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; From ef820a26cd84e9789231998dc927989a10868560 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Apr 2022 16:49:02 -0400 Subject: [PATCH 015/240] Bcopy on crusher compile --- Grid/GridStd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 28f6bc46..d0a8124a 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include From f77f3a6598decb2c4e3ef55bcedd1dacbc3623d0 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 6 Apr 2022 10:21:04 -0400 Subject: [PATCH 016/240] Imported G-parity flavor algebra + tester from feature/gparity_HMC branch --- Grid/GridQCDcore.h | 1 + Grid/qcd/QCD.h | 25 ++ Grid/qcd/gparity/Gparity.h | 6 + Grid/qcd/gparity/GparityFlavour.cc | 34 +++ Grid/qcd/gparity/GparityFlavour.h | 475 +++++++++++++++++++++++++++++ tests/core/Test_gparity_flavour.cc | 177 +++++++++++ 6 files changed, 718 insertions(+) create mode 100644 Grid/qcd/gparity/Gparity.h create mode 100644 Grid/qcd/gparity/GparityFlavour.cc create mode 100644 Grid/qcd/gparity/GparityFlavour.h create mode 100644 tests/core/Test_gparity_flavour.cc diff --git a/Grid/GridQCDcore.h b/Grid/GridQCDcore.h index cae6f43f..065b62cd 100644 --- a/Grid/GridQCDcore.h +++ b/Grid/GridQCDcore.h @@ -36,6 +36,7 @@ Author: paboyle #include #include #include +#include #include #include NAMESPACE_CHECK(GridQCDCore); diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 858aead7..81356a66 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range #define ColourIndex (2) #define SpinIndex (1) #define LorentzIndex (0) +#define GparityFlavourIndex (0) // Also should make these a named enum type static constexpr int DaggerNo=0; @@ -87,6 +88,8 @@ template struct isCoarsened { template using IfCoarsened = Invoke::value,int> > ; template using 
IfNotCoarsened = Invoke::value,int> > ; +const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom! + // ChrisK very keen to add extra space for Gparity doubling. // // Also add domain wall index, in a way where Wilson operator @@ -110,8 +113,10 @@ template using iHalfSpinColourVector = iScalar using iSpinColourSpinColourMatrix = iScalar, Ns>, Nc>, Ns> >; +template using iGparityFlavourVector = iVector >, Ngp>; template using iGparitySpinColourVector = iVector, Ns>, Ngp >; template using iGparityHalfSpinColourVector = iVector, Nhs>, Ngp >; +template using iGparityFlavourMatrix = iMatrix >, Ngp>; // Spin matrix typedef iSpinMatrix SpinMatrix; @@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixD; +//G-parity flavour matrix +typedef iGparityFlavourMatrix GparityFlavourMatrix; +typedef iGparityFlavourMatrix GparityFlavourMatrixF; +typedef iGparityFlavourMatrix GparityFlavourMatrixD; + +typedef iGparityFlavourMatrix vGparityFlavourMatrix; +typedef iGparityFlavourMatrix vGparityFlavourMatrixF; +typedef iGparityFlavourMatrix vGparityFlavourMatrixD; + + // Spin vector typedef iSpinVector SpinVector; typedef iSpinVector SpinVectorF; @@ -220,6 +235,16 @@ typedef iHalfSpinColourVector HalfSpinColourVectorD; typedef iHalfSpinColourVector vHalfSpinColourVector; typedef iHalfSpinColourVector vHalfSpinColourVectorF; typedef iHalfSpinColourVector vHalfSpinColourVectorD; + +//G-parity flavour vector +typedef iGparityFlavourVector GparityFlavourVector; +typedef iGparityFlavourVector GparityFlavourVectorF; +typedef iGparityFlavourVector GparityFlavourVectorD; + +typedef iGparityFlavourVector vGparityFlavourVector; +typedef iGparityFlavourVector vGparityFlavourVectorF; +typedef iGparityFlavourVector vGparityFlavourVectorD; + // singlets typedef iSinglet TComplex; // FIXME This is painful. 
Tensor singlet complex type. diff --git a/Grid/qcd/gparity/Gparity.h b/Grid/qcd/gparity/Gparity.h new file mode 100644 index 00000000..ce1c70eb --- /dev/null +++ b/Grid/qcd/gparity/Gparity.h @@ -0,0 +1,6 @@ +#ifndef GRID_GPARITY_H_ +#define GRID_GPARITY_H_ + +#include + +#endif diff --git a/Grid/qcd/gparity/GparityFlavour.cc b/Grid/qcd/gparity/GparityFlavour.cc new file mode 100644 index 00000000..4596f96b --- /dev/null +++ b/Grid/qcd/gparity/GparityFlavour.cc @@ -0,0 +1,34 @@ +#include + +NAMESPACE_BEGIN(Grid); + +const std::array GparityFlavour::sigma_mu = {{ + GparityFlavour(GparityFlavour::Algebra::SigmaX), + GparityFlavour(GparityFlavour::Algebra::SigmaY), + GparityFlavour(GparityFlavour::Algebra::SigmaZ) + }}; + +const std::array GparityFlavour::sigma_all = {{ + GparityFlavour(GparityFlavour::Algebra::Identity), + GparityFlavour(GparityFlavour::Algebra::SigmaX), + GparityFlavour(GparityFlavour::Algebra::SigmaY), + GparityFlavour(GparityFlavour::Algebra::SigmaZ), + GparityFlavour(GparityFlavour::Algebra::ProjPlus), + GparityFlavour(GparityFlavour::Algebra::ProjMinus) +}}; + +const std::array GparityFlavour::name = {{ + "SigmaX", + "MinusSigmaX", + "SigmaY", + "MinusSigmaY", + "SigmaZ", + "MinusSigmaZ", + "Identity", + "MinusIdentity", + "ProjPlus", + "MinusProjPlus", + "ProjMinus", + "MinusProjMinus"}}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/gparity/GparityFlavour.h b/Grid/qcd/gparity/GparityFlavour.h new file mode 100644 index 00000000..b2009235 --- /dev/null +++ b/Grid/qcd/gparity/GparityFlavour.h @@ -0,0 +1,475 @@ +#ifndef GRID_QCD_GPARITY_FLAVOUR_H +#define GRID_QCD_GPARITY_FLAVOUR_H + +//Support for flavour-matrix operations acting on the G-parity flavour index + +#include + +NAMESPACE_BEGIN(Grid); + +class GparityFlavour { + public: + GRID_SERIALIZABLE_ENUM(Algebra, undef, + SigmaX, 0, + MinusSigmaX, 1, + SigmaY, 2, + MinusSigmaY, 3, + SigmaZ, 4, + MinusSigmaZ, 5, + Identity, 6, + MinusIdentity, 7, + ProjPlus, 8, + MinusProjPlus, 9, + ProjMinus, 
10, + MinusProjMinus, 11 + ); + static constexpr unsigned int nSigma = 12; + static const std::array name; + static const std::array sigma_mu; + static const std::array sigma_all; + Algebra g; + public: + accelerator GparityFlavour(Algebra initg): g(initg) {} +}; + + + +// 0 1 x vector +// 1 0 +template +accelerator_inline void multFlavourSigmaX(iVector &ret, const iVector &rhs) +{ + ret(0) = rhs(1); + ret(1) = rhs(0); +}; +template +accelerator_inline void lmultFlavourSigmaX(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(1,0); + ret(0,1) = rhs(1,1); + ret(1,0) = rhs(0,0); + ret(1,1) = rhs(0,1); +}; +template +accelerator_inline void rmultFlavourSigmaX(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(0,1); + ret(0,1) = rhs(0,0); + ret(1,0) = rhs(1,1); + ret(1,1) = rhs(1,0); +}; + + +template +accelerator_inline void multFlavourMinusSigmaX(iVector &ret, const iVector &rhs) +{ + ret(0) = -rhs(1); + ret(1) = -rhs(0); +}; +template +accelerator_inline void lmultFlavourMinusSigmaX(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(1,0); + ret(0,1) = -rhs(1,1); + ret(1,0) = -rhs(0,0); + ret(1,1) = -rhs(0,1); +}; +template +accelerator_inline void rmultFlavourMinusSigmaX(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(0,1); + ret(0,1) = -rhs(0,0); + ret(1,0) = -rhs(1,1); + ret(1,1) = -rhs(1,0); +}; + + + + + +// 0 -i x vector +// i 0 +template +accelerator_inline void multFlavourSigmaY(iVector &ret, const iVector &rhs) +{ + ret(0) = timesMinusI(rhs(1)); + ret(1) = timesI(rhs(0)); +}; +template +accelerator_inline void lmultFlavourSigmaY(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = timesMinusI(rhs(1,0)); + ret(0,1) = timesMinusI(rhs(1,1)); + ret(1,0) = timesI(rhs(0,0)); + ret(1,1) = timesI(rhs(0,1)); +}; +template +accelerator_inline void rmultFlavourSigmaY(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = timesI(rhs(0,1)); + ret(0,1) = timesMinusI(rhs(0,0)); + ret(1,0) = timesI(rhs(1,1)); + ret(1,1) = timesMinusI(rhs(1,0)); +}; + +template 
+accelerator_inline void multFlavourMinusSigmaY(iVector &ret, const iVector &rhs) +{ + ret(0) = timesI(rhs(1)); + ret(1) = timesMinusI(rhs(0)); +}; +template +accelerator_inline void lmultFlavourMinusSigmaY(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = timesI(rhs(1,0)); + ret(0,1) = timesI(rhs(1,1)); + ret(1,0) = timesMinusI(rhs(0,0)); + ret(1,1) = timesMinusI(rhs(0,1)); +}; +template +accelerator_inline void rmultFlavourMinusSigmaY(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = timesMinusI(rhs(0,1)); + ret(0,1) = timesI(rhs(0,0)); + ret(1,0) = timesMinusI(rhs(1,1)); + ret(1,1) = timesI(rhs(1,0)); +}; + + + + + +// 1 0 x vector +// 0 -1 +template +accelerator_inline void multFlavourSigmaZ(iVector &ret, const iVector &rhs) +{ + ret(0) = rhs(0); + ret(1) = -rhs(1); +}; +template +accelerator_inline void lmultFlavourSigmaZ(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(0,0); + ret(0,1) = rhs(0,1); + ret(1,0) = -rhs(1,0); + ret(1,1) = -rhs(1,1); +}; +template +accelerator_inline void rmultFlavourSigmaZ(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(0,0); + ret(0,1) = -rhs(0,1); + ret(1,0) = rhs(1,0); + ret(1,1) = -rhs(1,1); +}; + + +template +accelerator_inline void multFlavourMinusSigmaZ(iVector &ret, const iVector &rhs) +{ + ret(0) = -rhs(0); + ret(1) = rhs(1); +}; +template +accelerator_inline void lmultFlavourMinusSigmaZ(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(0,0); + ret(0,1) = -rhs(0,1); + ret(1,0) = rhs(1,0); + ret(1,1) = rhs(1,1); +}; +template +accelerator_inline void rmultFlavourMinusSigmaZ(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(0,0); + ret(0,1) = rhs(0,1); + ret(1,0) = -rhs(1,0); + ret(1,1) = rhs(1,1); +}; + + + + + + +template +accelerator_inline void multFlavourIdentity(iVector &ret, const iVector &rhs) +{ + ret(0) = rhs(0); + ret(1) = rhs(1); +}; +template +accelerator_inline void lmultFlavourIdentity(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(0,0); + ret(0,1) = rhs(0,1); + ret(1,0) = 
rhs(1,0); + ret(1,1) = rhs(1,1); +}; +template +accelerator_inline void rmultFlavourIdentity(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = rhs(0,0); + ret(0,1) = rhs(0,1); + ret(1,0) = rhs(1,0); + ret(1,1) = rhs(1,1); +}; + +template +accelerator_inline void multFlavourMinusIdentity(iVector &ret, const iVector &rhs) +{ + ret(0) = -rhs(0); + ret(1) = -rhs(1); +}; +template +accelerator_inline void lmultFlavourMinusIdentity(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(0,0); + ret(0,1) = -rhs(0,1); + ret(1,0) = -rhs(1,0); + ret(1,1) = -rhs(1,1); +}; +template +accelerator_inline void rmultFlavourMinusIdentity(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -rhs(0,0); + ret(0,1) = -rhs(0,1); + ret(1,0) = -rhs(1,0); + ret(1,1) = -rhs(1,1); +}; + + + + + +//G-parity flavour projection 1/2(1+\sigma_2) +//1 -i +//i 1 +template +accelerator_inline void multFlavourProjPlus(iVector &ret, const iVector &rhs) +{ + ret(0) = 0.5*rhs(0) + 0.5*timesMinusI(rhs(1)); + ret(1) = 0.5*timesI(rhs(0)) + 0.5*rhs(1); +}; +template +accelerator_inline void lmultFlavourProjPlus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0)); + ret(0,1) = 0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1)); + ret(1,0) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(1,0); + ret(1,1) = 0.5*timesI(rhs(0,1)) + 0.5*rhs(1,1); +}; +template +accelerator_inline void rmultFlavourProjPlus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(0,1)); + ret(0,1) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(0,1); + ret(1,0) = 0.5*rhs(1,0) + 0.5*timesI(rhs(1,1)); + ret(1,1) = 0.5*timesMinusI(rhs(1,0)) + 0.5*rhs(1,1); +}; + + +template +accelerator_inline void multFlavourMinusProjPlus(iVector &ret, const iVector &rhs) +{ + ret(0) = -0.5*rhs(0) + 0.5*timesI(rhs(1)); + ret(1) = 0.5*timesMinusI(rhs(0)) - 0.5*rhs(1); +}; +template +accelerator_inline void lmultFlavourMinusProjPlus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -0.5*rhs(0,0) + 0.5*timesI(rhs(1,0)); + ret(0,1) 
= -0.5*rhs(0,1) + 0.5*timesI(rhs(1,1)); + ret(1,0) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(1,0); + ret(1,1) = 0.5*timesMinusI(rhs(0,1)) - 0.5*rhs(1,1); +}; +template +accelerator_inline void rmultFlavourMinusProjPlus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1)); + ret(0,1) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(0,1); + ret(1,0) = -0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1)); + ret(1,1) = 0.5*timesI(rhs(1,0)) - 0.5*rhs(1,1); +}; + + + + + +//G-parity flavour projection 1/2(1-\sigma_2) +//1 i +//-i 1 +template +accelerator_inline void multFlavourProjMinus(iVector &ret, const iVector &rhs) +{ + ret(0) = 0.5*rhs(0) + 0.5*timesI(rhs(1)); + ret(1) = 0.5*timesMinusI(rhs(0)) + 0.5*rhs(1); +}; +template +accelerator_inline void lmultFlavourProjMinus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = 0.5*rhs(0,0) + 0.5*timesI(rhs(1,0)); + ret(0,1) = 0.5*rhs(0,1) + 0.5*timesI(rhs(1,1)); + ret(1,0) = 0.5*timesMinusI(rhs(0,0)) + 0.5*rhs(1,0); + ret(1,1) = 0.5*timesMinusI(rhs(0,1)) + 0.5*rhs(1,1); +}; +template +accelerator_inline void rmultFlavourProjMinus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = 0.5*rhs(0,0) + 0.5*timesMinusI(rhs(0,1)); + ret(0,1) = 0.5*timesI(rhs(0,0)) + 0.5*rhs(0,1); + ret(1,0) = 0.5*rhs(1,0) + 0.5*timesMinusI(rhs(1,1)); + ret(1,1) = 0.5*timesI(rhs(1,0)) + 0.5*rhs(1,1); +}; + + +template +accelerator_inline void multFlavourMinusProjMinus(iVector &ret, const iVector &rhs) +{ + ret(0) = -0.5*rhs(0) + 0.5*timesMinusI(rhs(1)); + ret(1) = 0.5*timesI(rhs(0)) - 0.5*rhs(1); +}; +template +accelerator_inline void lmultFlavourMinusProjMinus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -0.5*rhs(0,0) + 0.5*timesMinusI(rhs(1,0)); + ret(0,1) = -0.5*rhs(0,1) + 0.5*timesMinusI(rhs(1,1)); + ret(1,0) = 0.5*timesI(rhs(0,0)) - 0.5*rhs(1,0); + ret(1,1) = 0.5*timesI(rhs(0,1)) - 0.5*rhs(1,1); +}; +template +accelerator_inline void rmultFlavourMinusProjMinus(iMatrix &ret, const iMatrix &rhs) +{ + ret(0,0) = -0.5*rhs(0,0) + 
0.5*timesI(rhs(0,1)); + ret(0,1) = 0.5*timesMinusI(rhs(0,0)) - 0.5*rhs(0,1); + ret(1,0) = -0.5*rhs(1,0) + 0.5*timesI(rhs(1,1)); + ret(1,1) = 0.5*timesMinusI(rhs(1,0)) - 0.5*rhs(1,1); +}; + + + + + + + + + + +template +accelerator_inline auto operator*(const GparityFlavour &G, const iVector &arg) +->typename std::enable_if, GparityFlavourTensorIndex>::value, iVector>::type +{ + iVector ret; + + switch (G.g) + { + case GparityFlavour::Algebra::SigmaX: + multFlavourSigmaX(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaX: + multFlavourMinusSigmaX(ret, arg); break; + case GparityFlavour::Algebra::SigmaY: + multFlavourSigmaY(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaY: + multFlavourMinusSigmaY(ret, arg); break; + case GparityFlavour::Algebra::SigmaZ: + multFlavourSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaZ: + multFlavourMinusSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::Identity: + multFlavourIdentity(ret, arg); break; + case GparityFlavour::Algebra::MinusIdentity: + multFlavourMinusIdentity(ret, arg); break; + case GparityFlavour::Algebra::ProjPlus: + multFlavourProjPlus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjPlus: + multFlavourMinusProjPlus(ret, arg); break; + case GparityFlavour::Algebra::ProjMinus: + multFlavourProjMinus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjMinus: + multFlavourMinusProjMinus(ret, arg); break; + default: assert(0); + } + + return ret; +} + +template +accelerator_inline auto operator*(const GparityFlavour &G, const iMatrix &arg) +->typename std::enable_if, GparityFlavourTensorIndex>::value, iMatrix>::type +{ + iMatrix ret; + + switch (G.g) + { + case GparityFlavour::Algebra::SigmaX: + lmultFlavourSigmaX(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaX: + lmultFlavourMinusSigmaX(ret, arg); break; + case GparityFlavour::Algebra::SigmaY: + lmultFlavourSigmaY(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaY: + 
lmultFlavourMinusSigmaY(ret, arg); break; + case GparityFlavour::Algebra::SigmaZ: + lmultFlavourSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaZ: + lmultFlavourMinusSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::Identity: + lmultFlavourIdentity(ret, arg); break; + case GparityFlavour::Algebra::MinusIdentity: + lmultFlavourMinusIdentity(ret, arg); break; + case GparityFlavour::Algebra::ProjPlus: + lmultFlavourProjPlus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjPlus: + lmultFlavourMinusProjPlus(ret, arg); break; + case GparityFlavour::Algebra::ProjMinus: + lmultFlavourProjMinus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjMinus: + lmultFlavourMinusProjMinus(ret, arg); break; + default: assert(0); + } + + return ret; +} + +template +accelerator_inline auto operator*(const iMatrix &arg, const GparityFlavour &G) +->typename std::enable_if, GparityFlavourTensorIndex>::value, iMatrix>::type +{ + iMatrix ret; + + switch (G.g) + { + case GparityFlavour::Algebra::SigmaX: + rmultFlavourSigmaX(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaX: + rmultFlavourMinusSigmaX(ret, arg); break; + case GparityFlavour::Algebra::SigmaY: + rmultFlavourSigmaY(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaY: + rmultFlavourMinusSigmaY(ret, arg); break; + case GparityFlavour::Algebra::SigmaZ: + rmultFlavourSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::MinusSigmaZ: + rmultFlavourMinusSigmaZ(ret, arg); break; + case GparityFlavour::Algebra::Identity: + rmultFlavourIdentity(ret, arg); break; + case GparityFlavour::Algebra::MinusIdentity: + rmultFlavourMinusIdentity(ret, arg); break; + case GparityFlavour::Algebra::ProjPlus: + rmultFlavourProjPlus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjPlus: + rmultFlavourMinusProjPlus(ret, arg); break; + case GparityFlavour::Algebra::ProjMinus: + rmultFlavourProjMinus(ret, arg); break; + case GparityFlavour::Algebra::MinusProjMinus: + 
rmultFlavourMinusProjMinus(ret, arg); break; + default: assert(0); + } + + return ret; +} + +NAMESPACE_END(Grid); + +#endif // include guard diff --git a/tests/core/Test_gparity_flavour.cc b/tests/core/Test_gparity_flavour.cc new file mode 100644 index 00000000..5b204e04 --- /dev/null +++ b/tests/core/Test_gparity_flavour.cc @@ -0,0 +1,177 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_gparity_flavour.cc + +Copyright (C) 2015-2017 + +Author: Christopher Kelly +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + +static constexpr double tolerance = 1.0e-6; +static std::array testAlgebra; + +void print(const GparityFlavourMatrix &g) +{ + for(int i = 0; i < Ngp; i++) + { + std::cout << GridLogMessage << "("; + for(int j=0;j testg; + const Complex I(0., 1.), mI(0., -1.); + + // 0 1 + // 1 0 + testg[0] = Zero(); + testg[0](0, 1)()() = 1.; + testg[0](1, 0)()() = 1.; + std::cout << GridLogMessage << "test SigmaX= " << std::endl; + print(testg[0]); + + // 0 -i + // i 0 + testg[1] = Zero(); + testg[1](0, 1)()() = mI; + testg[1](1, 0)()() = I; + std::cout << GridLogMessage << "test SigmaY= " << std::endl; + print(testg[1]); + + // 1 0 + // 0 -1 + testg[2] = Zero(); + testg[2](0, 0)()() = 1.0; + testg[2](1, 1)()() = -1.0; + std::cout << GridLogMessage << "test SigmaZ= " << std::endl; + print(testg[2]); + + +#define DEFINE_TEST_G(g, exp)\ +testAlgebra[GparityFlavour::Algebra::g] = exp; \ +testAlgebra[GparityFlavour::Algebra::Minus##g] = -exp; + + DEFINE_TEST_G(SigmaX , testg[0]); + DEFINE_TEST_G(SigmaY , testg[1]); + DEFINE_TEST_G(SigmaZ , testg[2]); + DEFINE_TEST_G(Identity , 1.); + + GparityFlavourMatrix pplus; + pplus = 1.0; + pplus = pplus + testg[1]; + pplus = pplus * 0.5; + + DEFINE_TEST_G(ProjPlus , pplus); + + GparityFlavourMatrix pminus; + pminus = 1.0; + pminus = pminus - testg[1]; + pminus = pminus * 0.5; + + DEFINE_TEST_G(ProjMinus , pminus); + +#undef DEFINE_TEST_G +} + +template +void test(const Expr &a, const Expr &b) +{ + if (norm2(a - b) < tolerance) + { + std::cout << "[OK] "; + } + else + { + std::cout << "[fail]" << std::endl; + std::cout << GridLogError << "a= " << a << std::endl; + std::cout << GridLogError << "is different (tolerance= " << tolerance << ") from " << std::endl; + std::cout << GridLogError << "b= " << b << std::endl; + 
exit(EXIT_FAILURE); + } +} + +void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng) +{ + GparityFlavourVector v; + GparityFlavourMatrix m, &testg = testAlgebra[a]; + GparityFlavour g(a); + + random(rng, v); + random(rng, m); + + std::cout << GridLogMessage << "Checking " << GparityFlavour::name[a] << ": "; + std::cout << "vecmul "; + test(g*v, testg*v); + std::cout << "matlmul "; + test(g*m, testg*m); + std::cout << "matrmul "; + test(m*g, m*testg); + std::cout << std::endl; +} + +int main(int argc, char *argv[]) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridSerialRNG sRNG; + + sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + std::cout << GridLogMessage << "======== Test algebra" << std::endl; + createTestAlgebra(); + std::cout << GridLogMessage << "======== Multiplication operators check" << std::endl; + for (int i = 0; i < GparityFlavour::nSigma; ++i) + { + checkSigma(i, sRNG); + } + std::cout << GridLogMessage << std::endl; + + Grid_finalize(); + + return EXIT_SUCCESS; +} From 81fe4c937e61a26c35db1a92ba055b18e15045eb Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 12 Apr 2022 09:51:59 -0400 Subject: [PATCH 017/240] Hopefully fix link errors on Intel compilers due to having no function body for MomentumFilterBase::apply_phase --- Grid/qcd/action/filters/MomentumFilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/filters/MomentumFilter.h b/Grid/qcd/action/filters/MomentumFilter.h index 2a15d80c..864166f5 100644 --- a/Grid/qcd/action/filters/MomentumFilter.h +++ b/Grid/qcd/action/filters/MomentumFilter.h @@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid); template struct MomentumFilterBase{ - virtual void applyFilter(MomentaField &P) const; + virtual void applyFilter(MomentaField &P) const = 0; }; //Do 
nothing From 6121397587adfbaf876ce80dc697b631c022929e Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 9 May 2022 16:27:57 -0400 Subject: [PATCH 018/240] Imported changes from feature/gparity_HMC branch: Added storage of final true residual in mixed-prec CG and enhanced log output Fixed const correctness of multi-shift constructor Added a mixed precision variant of the multi-shift algorithm that uses a single precision operator and applies periodic reliable update to the residual Added tests/solver/Test_dwf_multishift_mixedprec to test the above Fixed local coherence lanczos using the (large!) max approx to the chebyshev eval as the scale from which to judge the quality of convergence, resulting a test that always passes Added a method to local coherence lanczos class that returns the fine eval/evec pair Added iterative log output to power method Added optional disabling of the plaquette check in Nerscio to support loading old G-parity configs which have a factor of 2 error in the plaquette G-parity Dirac op no longer allows GPBC in the time direction; instead we toggle between periodic and antiperiodic Replaced thread_for G-parity 5D force insertion implementation with accelerator_for version capable of running on GPUs Generalized tests/lanczos/Test_dwf_lanczos to support regular DWF as well as Gparity, with the action chosen by a command line option Modified tests/forces/Test_dwf_gpforce,Test_gpdwf_force,Test_gpwilson_force to use GPBC a spatial direction rather than the t-direction, and antiperiodic BCs for time direction tests/core/Test_gparity now supports using APBC in time direction using command line toggle --- Grid/algorithms/Algorithms.h | 1 + .../iterative/ConjugateGradientMixedPrec.h | 5 + .../iterative/ConjugateGradientMultiShift.h | 5 +- .../ConjugateGradientMultiShiftMixedPrec.h | 409 ++++++++++++++++++ .../iterative/LocalCoherenceLanczos.h | 48 +- Grid/algorithms/iterative/PowerMethod.h | 2 + Grid/parallelIO/NerscIO.h | 6 +- 
Grid/qcd/action/fermion/GparityWilsonImpl.h | 136 ++++-- tests/core/Test_gparity.cc | 36 +- tests/forces/Test_dwf_gpforce.cc | 18 +- tests/forces/Test_gpdwf_force.cc | 6 +- tests/forces/Test_gpwilson_force.cc | 8 +- tests/lanczos/Test_dwf_lanczos.cc | 81 ++-- tests/solver/Test_dwf_multishift_mixedprec.cc | 184 ++++++++ 14 files changed, 852 insertions(+), 93 deletions(-) create mode 100644 Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h create mode 100644 tests/solver/Test_dwf_multishift_mixedprec.cc diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 7f27784b..47a0a92b 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index cd2e4374..31ac55e0 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid); Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalOuterIterations; //Number of restarts Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + RealD TrueResidual; //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess LinearFunction *guesser; @@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid); } void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl; TotalInnerIterations = 0; GridStopWatch TotalTimer; @@ -97,6 +99,7 @@ NAMESPACE_BEGIN(Grid); FieldF sol_f(SinglePrecGrid); sol_f.Checkerboard() = cb; + std::cout< CG_f(inner_tol, MaxInnerIterations); CG_f.ErrorOnNoConverge = false; @@ -130,6 
+133,7 @@ NAMESPACE_BEGIN(Grid); (*guesser)(src_f, sol_f); //Inner CG + std::cout< CG_d(Tolerance, MaxInnerIterations); CG_d(Linop_d, src_d_in, sol_d); TotalFinalStepIterations = CG_d.IterationsToComplete; + TrueResidual = CG_d.TrueResidual; TotalTimer.Stop(); std::cout< TrueResidualShift; - ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : + ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : MaxIterations(maxit), shifts(_shifts) { @@ -182,6 +182,9 @@ public: for(int s=0;s +class ShiftedLinop: public LinearOperatorBase{ +public: + LinearOperatorBase &linop_base; + RealD shift; + + ShiftedLinop(LinearOperatorBase &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){} + + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + + void HermOp(const Field &in, Field &out){ + linop_base.HermOp(in, out); + axpy(out, shift, in, out); + } + + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + HermOp(in,out); + ComplexD dot = innerProduct(in,out); + n1=real(dot); + n2=norm2(out); + } +}; +}; + + +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction, + public OperatorFunction +{ +public: + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion + std::vector IterationsToCompleteShift; // Iterations for this shift + int verbose; + MultiShiftFunction shifts; + std::vector TrueResidualShift; + + int ReliableUpdateFreq; //number of iterations between reliable updates + + GridBase* SinglePrecGrid; //Grid for single-precision fields + LinearOperatorBase &Linop_f; //single precision + + ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts, + GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, + int _ReliableUpdateFreq + ) : + MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq) + { + verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); + } + + void operator() (LinearOperatorBase &Linop, const FieldD &src, FieldD &psi) + { + GridBase *grid = src.Grid(); + int nshift = shifts.order; + std::vector results(nshift,grid); + (*this)(Linop,src,results,psi); + } + void operator() (LinearOperatorBase &Linop, const FieldD &src, std::vector &results, FieldD &psi) + { + int nshift = shifts.order; + + (*this)(Linop,src,results); + + psi = shifts.norm*src; + for(int i=0;i &Linop_d, const FieldD &src_d, std::vector &psi_d) + { + GridBase *DoublePrecGrid = src_d.Grid(); + + //////////////////////////////////////////////////////////////////////// + // Convenience references to the info stored in "MultiShiftFunction" + //////////////////////////////////////////////////////////////////////// + int nshift = shifts.order; + + std::vector &mass(shifts.poles); // Make references to array in "shifts" + std::vector &mresidual(shifts.tolerances); + std::vector alpha(nshift,1.0); + + //Double precision search directions + FieldD p_d(DoublePrecGrid); + std::vector ps_d(nshift, DoublePrecGrid);// Search directions (double precision) + + FieldD tmp_d(DoublePrecGrid); + FieldD r_d(DoublePrecGrid); + FieldD mmp_d(DoublePrecGrid); + + 
assert(psi_d.size()==nshift); + assert(mass.size()==nshift); + assert(mresidual.size()==nshift); + + // dynamic sized arrays on stack; 2d is a pain with vector + RealD bs[nshift]; + RealD rsq[nshift]; + RealD z[nshift][2]; + int converged[nshift]; + + const int primary =0; + + //Primary shift fields CG iteration + RealD a,b,c,d; + RealD cp,bp,qq; //prev + + // Matrix mult fields + FieldF r_f(SinglePrecGrid); + FieldF p_f(SinglePrecGrid); + FieldF tmp_f(SinglePrecGrid); + FieldF mmp_f(SinglePrecGrid); + FieldF src_f(SinglePrecGrid); + precisionChange(src_f, src_d); + + // Check lightest mass + for(int s=0;s= mass[primary] ); + converged[s]=0; + } + + // Wire guess to zero + // Residuals "r" are src + // First search direction "p" is also src + cp = norm2(src_d); + + // Handle trivial case of zero src. + if( cp == 0. ){ + for(int s=0;s= rsq[s]){ + CleanupTimer.Start(); + std::cout< Linop_shift_d(Linop_d, mass[s]); + ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop Linop_shift_f(Linop_f, mass[s]); + + MixedPrecisionConjugateGradient cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); + cg(src_d, psi_d[s]); + + TrueResidualShift[s] = cg.TrueResidual; + CleanupTimer.Stop(); + } + } + + std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"< nbasis ) eresid = eresid*_coarse_relax_tol; + std::cout.precision(13); std::cout< nbasis ) eresid = eresid*_coarse_relax_tol; if( (vv on the coarse grid. This function orthnormalizes the fine-grid subspace + //vectors under the block inner product. This step must be performed after computing the fine grid + //eigenvectors and before computing the coarse grid eigenvectors. 
void Orthogonalise(void ) { CoarseScalar InnerProd(_CoarseGrid); std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"< Cheby(cheby_op); - ProjectedHermOp Op(_FineOp,subspace); - ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,subspace); + Chebyshev Cheby(cheby_op); //Chebyshev of fine operator on fine grid + ProjectedHermOp Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion + ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion ////////////////////////////////////////////////////////////////////////////////////////////////// // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL ////////////////////////////////////////////////////////////////////////////////////////////////// - Chebyshev ChebySmooth(cheby_smooth); - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); + Chebyshev ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); evals_coarse.resize(Nm); evec_coarse.resize(Nm,_CoarseGrid); CoarseField src(_CoarseGrid); src=1.0; + //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); int Nconv=0; IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); @@ -405,6 +427,14 @@ public: std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl; } } + + //Get the fine eigenvector 'i' by reconstruction + void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{ + blockPromote(evec_coarse[i],evec,subspace); + eval = evals_coarse[i]; + } + + }; NAMESPACE_END(Grid); diff --git 
a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 6aa8e923..027ea68c 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -29,6 +29,8 @@ template class PowerMethod RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vden = norm2(src_n); RealD na = vnum/vden; + + std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index 99011e25..88278131 100644 --- a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -39,9 +39,11 @@ using namespace Grid; //////////////////////////////////////////////////////////////////////////////// class NerscIO : public BinaryIO { public: - typedef Lattice GaugeField; + // Enable/disable exiting if the plaquette in the header does not match the value computed (default true) + static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; } + static inline void truncate(std::string file){ std::ofstream fout(file,std::ios::out); } @@ -198,7 +200,7 @@ public: std::cerr << " nersc_csum " < class GparityWilsonImpl : public ConjugateGaugeImpl > { public: @@ -113,7 +125,7 @@ public: || ((distance== 1)&&(icoor[direction]==1)) || ((distance==-1)&&(icoor[direction]==0)); - permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world + permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction //Apply the links int f_upper = permute_lane ? 
1 : 0; @@ -139,10 +151,10 @@ public: assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code assert((sl == 1) || (sl == 2)); - if ( SE->_around_the_world && St.parameters.twists[mmu] ) { - + //If this site is an global boundary site, perform the G-parity flavor twist + if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) { if ( sl == 2 ) { - + //Only do the twist for lanes on the edge of the physical node ExtractBuffer vals(Nsimd); extract(chi,vals); @@ -197,6 +209,19 @@ public: reg = memory; } + + //Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds + inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){ + autoView(poke_f0_v, poke_f0, CpuRead); + autoView(poke_f1_v, poke_f1, CpuRead); + autoView(Uds_v, Uds, CpuWrite); + thread_foreach(ss,poke_f0_v,{ + Uds_v[ss](0)(mu) = poke_f0_v[ss](); + Uds_v[ss](1)(mu) = poke_f1_v[ss](); + }); + } + + inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) { conformable(Uds.Grid(),GaugeGrid); @@ -207,14 +232,19 @@ public: GaugeLinkField Uconj(GaugeGrid); Lattice > coor(GaugeGrid); - - for(int mu=0;mu(Umu,mu); Uconj = conjugate(U); + // Implement the isospin rotation sign on the boundary between f=1 and f=0 // This phase could come from a simple bc 1,1,-1,1 .. 
int neglink = GaugeGrid->GlobalDimensions()[mu]-1; if ( Params.twists[mu] ) { @@ -229,7 +259,7 @@ public: thread_foreach(ss,U_v,{ Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss](); - }); + }); } U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary @@ -260,6 +290,38 @@ public: }); } } + + { //periodic / antiperiodic temporal BCs + int mu = Nd-1; + int L = GaugeGrid->GlobalDimensions()[mu]; + int Lmu = L - 1; + + LatticeCoordinate(coor, mu); + + U = PeekIndex(Umu, mu); //Get t-directed links + + GaugeLinkField *Upoke = &U; + + if(Params.twists[mu]){ //antiperiodic + Utmp = where(coor == Lmu, -U, U); + Upoke = &Utmp; + } + + Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links + pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu); + + //Get the barrel-shifted field + Utmp = adj(Cshift(U, mu, -1)); //is a forward shift! + Upoke = &Utmp; + + if(Params.twists[mu]){ + U = where(coor == 0, -Utmp, Utmp); //boundary phase + Upoke = &U; + } + + Uconj = conjugate(*Upoke); + pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4); + } } inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { @@ -298,28 +360,48 @@ public: inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds){ assert(0); } - + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { - - int Ls = Btilde.Grid()->_fdimensions[0]; - - GaugeLinkField tmp(mat.Grid()); - tmp = Zero(); + int Ls=Btilde.Grid()->_fdimensions[0]; + { - autoView( tmp_v , tmp, CpuWrite); - autoView( Atilde_v , Atilde, CpuRead); - autoView( Btilde_v , Btilde, CpuRead); - thread_for(ss,tmp.Grid()->oSites(),{ - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); - tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - }); + GridBase *GaugeGrid = mat.Grid(); + Lattice > coor(GaugeGrid); + + if( Params.twists[mu] ){ + 
LatticeCoordinate(coor,mu); + } + + autoView( mat_v , mat, AcceleratorWrite); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{ + int sU=sss; + typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; + ColorMatrixType sum; + zeroit(sum); + for(int s=0;s(mat, tmp, mu); - return; } + + + + }; diff --git a/tests/core/Test_gparity.cc b/tests/core/Test_gparity.cc index b2068901..5bf98ba6 100644 --- a/tests/core/Test_gparity.cc +++ b/tests/core/Test_gparity.cc @@ -55,13 +55,17 @@ static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIM int main (int argc, char ** argv) { int nu = 0; - + int tbc_aprd = 0; //use antiperiodic BCs in the time direction? + Grid_init(&argc,&argv); for(int i=1;i> nu; std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl; + }else if(std::string(argv[i]) == "--Tbc-APRD"){ + tbc_aprd = 1; + std::cout << GridLogMessage << "Using antiperiodic BCs in the time direction" << std::endl; } } @@ -155,13 +159,18 @@ int main (int argc, char ** argv) //Coordinate grid for reference LatticeInteger xcoor_1f5(FGrid_1f); - LatticeCoordinate(xcoor_1f5,1+nu); + LatticeCoordinate(xcoor_1f5,1+nu); //note '1+nu'! 
This is because for 5D fields the s-direction is direction 0 Replicate(src,src_1f); src_1f = where( xcoor_1f5 >= Integer(L), 2.0*src_1f,src_1f ); RealD mass=0.0; RealD M5=1.8; - StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS); + + //Standard Dirac op + AcceleratorVector bc_std(Nd, 1.0); + if(tbc_aprd) bc_std[Nd-1] = -1.; //antiperiodic time BC + StandardDiracOp::ImplParams std_params(bc_std); + StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS, std_params); StandardFermionField src_o_1f(FrbGrid_1f); StandardFermionField result_o_1f(FrbGrid_1f); @@ -172,9 +181,11 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o_1f,result_o_1f); - // const int nu = 3; + //Gparity Dirac op std::vector twists(Nd,0); twists[nu] = 1; + if(tbc_aprd) twists[Nd-1] = 1; + GparityDiracOp::ImplParams params; params.twists = twists; GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params); @@ -271,8 +282,11 @@ int main (int argc, char ** argv) std::cout << "2f cb "<(result_o_2f,0); - res1o = PeekIndex<0>(result_o_2f,1); + res0o = PeekIndex<0>(result_o_2f,0); //flavor 0, odd cb + res1o = PeekIndex<0>(result_o_2f,1); //flavor 1, odd cb std::cout << "res cb "<= Integer(L), replica1,replica0 ); replica0 = Zero(); setCheckerboard(replica0,result_o_1f); - std::cout << "Norm2 solutions is " < twists(Nd,0); // twists[nu] = 1; - // GparityDomainWallFermionR::ImplParams params; params.twists = twists; - // GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); - // DomainWallFermionR Dw (U, Grid,RBGrid,mass,M5); - - const int nu = 3; + const int nu = 0; //gparity direction std::vector twists(Nd,0); twists[nu] = 1; + twists[Nd-1] = 1; //antiperiodic in time GparityDomainWallFermionR::ImplParams params; params.twists = twists; - - /* - params.boundary_phases[0] = 1.0; - params.boundary_phases[1] = 1.0; - 
params.boundary_phases[2] = 1.0; - params.boundary_phases[3] =- 1.0; - */ - + GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Dw.M (phi,Mphi); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index d6744080..af1ce82b 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -71,8 +71,10 @@ int main (int argc, char ** argv) RealD mass=0.01; RealD M5=1.8; - const int nu = 3; - std::vector twists(Nd,0); twists[nu] = 1; + const int nu = 1; + std::vector twists(Nd,0); + twists[nu] = 1; + twists[3] = 1; GparityDomainWallFermionR::ImplParams params; params.twists = twists; GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Ddwf.M (phi,Mphi); diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index d731f27a..7ab2ddeb 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -64,8 +64,12 @@ int main (int argc, char ** argv) //////////////////////////////////// RealD mass=0.01; - const int nu = 3; - std::vector twists(Nd,0); twists[nu] = 1; + const int nu = 1; + const int Lnu=latt_size[nu]; + + std::vector twists(Nd,0); + twists[nu] = 1; + twists[3]=1; GparityWilsonFermionR::ImplParams params; params.twists = twists; GparityWilsonFermionR Wil(U,*UGrid,*UrbGrid,mass,params); Wil.M (phi,Mphi); diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 00d29ec0..1fe29bb2 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -31,14 +31,38 @@ using namespace std; using namespace Grid; ; -typedef typename GparityDomainWallFermionR::FermionField FermionField; +template +struct Setup{}; -RealD AllZero(RealD x){ return 0.;} +template<> +struct Setup{ + static GparityMobiusFermionR* getAction(LatticeGaugeField &Umu, + GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ + 
RealD mass=0.01; + RealD M5=1.8; + RealD mob_b=1.5; + GparityMobiusFermionD ::ImplParams params; + std::vector twists({1,1,1,0}); + params.twists = twists; + return new GparityMobiusFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); + } +}; -int main (int argc, char ** argv) -{ - Grid_init(&argc,&argv); +template<> +struct Setup{ + static DomainWallFermionR* getAction(LatticeGaugeField &Umu, + GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ + RealD mass=0.01; + RealD M5=1.8; + return new DomainWallFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + } +}; + + +template +void run(){ + typedef typename Action::FermionField FermionField; const int Ls=8; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); @@ -56,24 +80,10 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4, Umu); - std::vector U(4,UGrid); - for(int mu=0;mu(Umu,mu); - } - - RealD mass=0.01; - RealD M5=1.8; - RealD mob_b=1.5; -// DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - GparityMobiusFermionD ::ImplParams params; - std::vector twists({1,1,1,0}); - params.twists = twists; - GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); - -// MdagMLinearOperator HermOp(Ddwf); -// SchurDiagTwoOperator HermOp(Ddwf); - SchurDiagTwoOperator HermOp(Ddwf); -// SchurDiagMooeeOperator HermOp(Ddwf); + Action *action = Setup::getAction(Umu,FGrid,FrbGrid,UGrid,UrbGrid); + + //MdagMLinearOperator HermOp(Ddwf); + SchurDiagTwoOperator HermOp(*action); const int Nstop = 30; const int Nk = 40; @@ -90,8 +100,7 @@ int main (int argc, char ** argv) PlainHermOp Op (HermOp); ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); - - + std::vector eval(Nm); FermionField src(FrbGrid); gaussian(RNG5rb,src); @@ -103,6 +112,28 @@ int main (int argc, 
char ** argv) int Nconv; IRL.calc(eval,evec,src,Nconv); + delete action; +} + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + std::string action = "GparityMobius"; + for(int i=1;i(); + }else if(action == "DWF"){ + run(); + }else{ + std::cout << "Unknown action" << std::endl; + exit(1); + } + Grid_finalize(); } diff --git a/tests/solver/Test_dwf_multishift_mixedprec.cc b/tests/solver/Test_dwf_multishift_mixedprec.cc new file mode 100644 index 00000000..bdede459 --- /dev/null +++ b/tests/solver/Test_dwf_multishift_mixedprec.cc @@ -0,0 +1,184 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_multishift_mixedprec.cc + + Copyright (C) 2015 + +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + +template +void run_test(int argc, char ** argv, const typename SpeciesD::ImplParams ¶ms){ + const int Ls = 16; + GridCartesian* UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d); + GridCartesian* FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_d); + GridRedBlackCartesian* FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_d); + + GridCartesian* UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian* FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid_f); + GridRedBlackCartesian* FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid_f); + + typedef typename SpeciesD::FermionField FermionFieldD; + typedef typename SpeciesF::FermionField FermionFieldF; + + std::vector seeds4({1, 2, 3, 4}); + std::vector seeds5({5, 6, 7, 8}); + GridParallelRNG RNG5(FGrid_d); + RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid_d); + RNG4.SeedFixedIntegers(seeds4); + + FermionFieldD src_d(FGrid_d); + random(RNG5, src_d); + + LatticeGaugeFieldD Umu_d(UGrid_d); + + //CPS-created G-parity ensembles have a factor of 2 error in the plaquette that causes the read to fail unless we workaround it + bool gparity_plaquette_fix = false; + for(int i=1;i(Umu_d, metadata, file); + + if(gparity_plaquette_fix){ + metadata.plaquette *= 2.; //correct header value + + //Get the true plaquette + FieldMetaData tmp; + GaugeStatisticsType gs; gs(Umu_d, tmp); + + std::cout << "After correction: plaqs " << tmp.plaquette << " " << 
metadata.plaquette << std::endl; + assert(fabs(tmp.plaquette -metadata.plaquette ) < 1.0e-5 ); + } + + cfg_loaded=true; + break; + } + } + + if(!cfg_loaded) + SU::HotConfiguration(RNG4, Umu_d); + + LatticeGaugeFieldF Umu_f(UGrid_f); + precisionChange(Umu_f, Umu_d); + + std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; + + RealD mass = 0.01; + RealD M5 = 1.8; + SpeciesD Ddwf_d(Umu_d, *FGrid_d, *FrbGrid_d, *UGrid_d, *UrbGrid_d, mass, M5, params); + SpeciesF Ddwf_f(Umu_f, *FGrid_f, *FrbGrid_f, *UGrid_f, *UrbGrid_f, mass, M5, params); + + FermionFieldD src_o_d(FrbGrid_d); + pickCheckerboard(Odd, src_o_d, src_d); + + SchurDiagMooeeOperator HermOpEO_d(Ddwf_d); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + AlgRemez remez(1e-4, 64, 50); + int order = 15; + remez.generateApprox(order, 1, 2); //sqrt + + MultiShiftFunction shifts(remez, 1e-10, false); + + int relup_freq = 50; + double t1=usecond(); + ConjugateGradientMultiShiftMixedPrec mcg(10000, shifts, FrbGrid_f, HermOpEO_f, relup_freq); + + std::vector results_o_d(order, FrbGrid_d); + mcg(HermOpEO_d, src_o_d, results_o_d); + double t2=usecond(); + + //Crosscheck double and mixed prec results + ConjugateGradientMultiShift dmcg(10000, shifts); + std::vector results_o_d_2(order, FrbGrid_d); + dmcg(HermOpEO_d, src_o_d, results_o_d_2); + double t3=usecond(); + + std::cout << GridLogMessage << "Comparison of mixed prec results to double prec results |mixed - double|^2 :" << std::endl; + FermionFieldD tmp(FrbGrid_d); + for(int i=0;i= 0 && gpdir <= 2); //spatial! 
+ gparity = true; + } + } + if(gparity){ + std::cout << "Running test with G-parity BCs in " << gpdir << " direction" << std::endl; + GparityWilsonImplParams params; + params.twists[gpdir] = 1; + + std::vector conj_dirs(Nd,0); + conj_dirs[gpdir] = 1; + ConjugateGimplD::setDirections(conj_dirs); + + run_test(argc,argv,params); + }else{ + std::cout << "Running test with periodic BCs" << std::endl; + WilsonImplParams params; + run_test(argc,argv,params); + } + + Grid_finalize(); +} From b8ee19691c31e54ecd7fb009828d2d805b0f6ca2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 May 2022 09:08:12 -0700 Subject: [PATCH 019/240] Updated config for PM --- systems/Perlmutter/config-command | 4 ++++ systems/Perlmutter/dwf4.slurm | 24 ++++++++++-------------- systems/Perlmutter/sourceme.sh | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/systems/Perlmutter/config-command b/systems/Perlmutter/config-command index b399c535..b62704dc 100644 --- a/systems/Perlmutter/config-command +++ b/systems/Perlmutter/config-command @@ -1,9 +1,13 @@ +DIR=`pwd` +PREFIX=$DIR/../Prequisites/install/ ../../configure \ --enable-comms=mpi \ --enable-simd=GPU \ --enable-shm=nvlink \ --enable-gen-simd-width=64 \ --enable-accelerator=cuda \ + --disable-accelerator-cshift \ + --with-gmp=$PREFIX \ --disable-fermion-reps \ --disable-unified \ --disable-gparity \ diff --git a/systems/Perlmutter/dwf4.slurm b/systems/Perlmutter/dwf4.slurm index ba198595..f6d6a2a9 100644 --- a/systems/Perlmutter/dwf4.slurm +++ b/systems/Perlmutter/dwf4.slurm @@ -1,24 +1,20 @@ #!/bin/bash -#SBATCH -A mp13 +#SBATCH -A m3886_g #SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 0:20:00 -#SBATCH -n 16 +#SBATCH -q debug +#SBATCH -t 0:10:00 +#SBATCH -n 4 #SBATCH --ntasks-per-node=4 #SBATCH -c 32 #SBATCH --exclusive #SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=map_gpu:0,1,2,3 +#SBATCH --gpu-bind=none export SLURM_CPU_BIND="cores" -export MPICH_RDMA_ENABLED_CUDA=1 export MPICH_GPU_SUPPORT_ENABLED=1 -srun 
./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node - -OPT="--comms-overlap --comms-concurrent --shm-mpi 0" -srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0 -srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0 - +export MPICH_RDMA_ENABLED_CUDA=0 +export MPICH_GPU_IPC_ENABLED=0 +export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 +export MPICH_GPU_NO_ASYNC_MEMCPY=1 OPT="--comms-overlap --comms-concurrent --shm-mpi 1" -srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1 -srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1 +srun ./benchmarks/Benchmark_ITT --mpi 2.1.1.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > ITT.log diff --git a/systems/Perlmutter/sourceme.sh b/systems/Perlmutter/sourceme.sh index 9359dea9..6d09b1c9 100644 --- a/systems/Perlmutter/sourceme.sh +++ b/systems/Perlmutter/sourceme.sh @@ -1,4 +1,4 @@ export CRAY_ACCEL_TARGET=nvidia80 -module load PrgEnv-gnu cpe-cuda cuda +module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4 From aa008cbe99c3e7873da558539bab33a30e275f4f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 May 2022 16:44:39 -0700 Subject: [PATCH 020/240] Updated for new Dirichlet interface --- benchmarks/Benchmark_dwf_fp32.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index f79797fa..9695cf5a 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -249,8 +249,9 @@ void Benchmark(int Ls, Coordinate Dirichlet) if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm 
Nc=3 WilsonKernels" < Date: Thu, 19 May 2022 16:45:02 -0700 Subject: [PATCH 021/240] Dirichlet BCs --- Grid/qcd/action/ActionParams.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index 0e6a11c6..180dbbbf 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -37,24 +37,32 @@ NAMESPACE_BEGIN(Grid); // These can move into a params header and be given MacroMagic serialisation struct GparityWilsonImplParams { Coordinate twists; - GparityWilsonImplParams() : twists(Nd, 0) {}; + Coordinate dirichlet; // Blocksize of dirichlet BCs + GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {}; }; struct WilsonImplParams { bool overlapCommsCompute; + Coordinate dirichlet; // Blocksize of dirichlet BCs AcceleratorVector twist_n_2pi_L; AcceleratorVector boundary_phases; WilsonImplParams() { + dirichlet.resize(Nd,0); boundary_phases.resize(Nd, 1.0); twist_n_2pi_L.resize(Nd, 0.0); }; WilsonImplParams(const AcceleratorVector phi) : boundary_phases(phi), overlapCommsCompute(false) { twist_n_2pi_L.resize(Nd, 0.0); + dirichlet.resize(Nd,0); } }; struct StaggeredImplParams { - StaggeredImplParams() {}; + Coordinate dirichlet; // Blocksize of dirichlet BCs + StaggeredImplParams() + { + dirichlet.resize(Nd,0); + }; }; struct OneFlavourRationalParams : Serializable { From 2594e3c23091897b03744f707a5b30227b2db328 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 May 2022 16:45:19 -0700 Subject: [PATCH 022/240] Dirichlet option --- Grid/stencil/SimpleCompressor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index 1150b234..b36d954f 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -52,6 +52,11 @@ public: return arg; } }; +class SimpleStencilParams{ +public: + Coordinate dirichlet; + SimpleStencilParams() {}; +}; NAMESPACE_END(Grid); From 
b52e8ef65a42ecd10c97b0ea594d5a8ebe5a1a39 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 May 2022 16:45:41 -0700 Subject: [PATCH 023/240] Dirichlet changes --- examples/Example_Laplacian.cc | 9 +++++---- tests/Test_stencil.cc | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/Example_Laplacian.cc b/examples/Example_Laplacian.cc index fa8466cf..77e5fa35 100644 --- a/examples/Example_Laplacian.cc +++ b/examples/Example_Laplacian.cc @@ -93,14 +93,14 @@ template class FreeLaplacianStencil : public SparseMatrixBase StencilImpl; + typedef CartesianStencil StencilImpl; GridBase *grid; StencilImpl Stencil; SimpleCompressor Compressor; FreeLaplacianStencil(GridBase *_grid) - : Stencil (_grid,6,Even,directions,displacements,0), grid(_grid) + : Stencil (_grid,6,Even,directions,displacements,SimpleStencilParams()), grid(_grid) { }; virtual GridBase *Grid(void) { return grid; }; @@ -168,7 +168,8 @@ public: typedef iImplDoubledGaugeField SiteDoubledGaugeField; typedef Lattice DoubledGaugeField; - typedef CartesianStencil StencilImpl; + typedef CartesianStencil StencilImpl; + SimpleStencilParams p; GridBase *grid; StencilImpl Stencil; @@ -177,7 +178,7 @@ public: CovariantLaplacianStencil(GaugeField &Umu) : grid(Umu.Grid()), - Stencil (grid,6,Even,directions,displacements,0), + Stencil (grid,6,Even,directions,displacements,p), Uds(grid) { for (int mu = 0; mu < Nd; mu++) { diff --git a/tests/Test_stencil.cc b/tests/Test_stencil.cc index 93402a1c..f17faa27 100644 --- a/tests/Test_stencil.cc +++ b/tests/Test_stencil.cc @@ -31,7 +31,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; int main(int argc, char ** argv) { Grid_init(&argc, &argv); @@ -80,7 +79,8 @@ int main(int argc, char ** argv) { Foo=lex; } - typedef CartesianStencil Stencil; + typedef CartesianStencil Stencil; + SimpleStencilParams p; for(int dir=0;dir<4;dir++){ for(int disp=0;disp directions(npoint,dir); std::vector displacements(npoint,disp); - Stencil 
myStencil(&Fine,npoint,0,directions,displacements,0); + Stencil myStencil(&Fine,npoint,0,directions,displacements,p); Coordinate ocoor(4); for(int o=0;o directions(npoint,dir); std::vector displacements(npoint,disp); - Stencil EStencil(&rbFine,npoint,Even,directions,displacements,0); - Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,0); + Stencil EStencil(&rbFine,npoint,Even,directions,displacements,p); + Stencil OStencil(&rbFine,npoint,Odd,directions,displacements,p); Coordinate ocoor(4); for(int o=0;o Date: Thu, 19 May 2022 19:17:11 -0700 Subject: [PATCH 024/240] Dirichlet improved --- Grid/stencil/Stencil.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 3e356a1c..791d79eb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -131,7 +131,6 @@ class CartesianStencilAccelerator { int _checkerboard; int _npoints; // Move to template param? int _osites; - int _dirichlet; StencilVector _directions; StencilVector _distances; StencilVector _comms_send; @@ -650,16 +649,14 @@ public: /// Introduce a block structure and switch off comms on boundaries void DirichletBlock(const Coordinate &dirichlet_block) { - this->_dirichlet = 1; for(int ii=0;ii_npoints;ii++){ int dimension = this->_directions[ii]; int displacement = this->_distances[ii]; - int shift = displacement; int gd = _grid->_gdimensions[dimension]; int fd = _grid->_fdimensions[dimension]; int pd = _grid->_processors [dimension]; - int ld = gd/pd; int pc = _grid->_processor_coor[dimension]; + int ld = fd/pd; /////////////////////////////////////////// // Figure out dirichlet send and receive // on this leg of stencil. 
@@ -668,7 +665,7 @@ public: int block = dirichlet_block[dimension]; this->_comms_send[ii] = comm_dim; this->_comms_recv[ii] = comm_dim; - if ( block ) { + if ( block && comm_dim ) { assert(abs(displacement) < ld ); if( displacement > 0 ) { @@ -677,16 +674,16 @@ public: // | | | // noR // noS - if ( (ld*(pc+1) ) % block == 0 ) this->_comms_recv[ii] = 0; - if ( ( ld*pc ) % block == 0 ) this->_comms_send[ii] = 0; + if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_recv[ii] = 0; + if ( ( (ld*pc ) % block ) == 0 ) this->_comms_send[ii] = 0; } else { // High side, low side // | <--B--->| // | | | // noS // noR - if ( (ld*(pc+1) ) % block == 0 ) this->_comms_send[ii] = 0; - if ( ( ld*pc ) % block == 0 ) this->_comms_recv[ii] = 0; + if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; + if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; } } } @@ -698,7 +695,6 @@ public: const std::vector &distances, Parameters p) { - this->_dirichlet = 0; face_table_computed=0; _grid = grid; this->parameters=p; @@ -715,6 +711,8 @@ public: this->_comms_recv.resize(npoints); this->same_node.resize(npoints); + if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up + _unified_buffer_size=0; surface_list.resize(0); @@ -1106,6 +1104,7 @@ public: Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; + int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now @@ -1126,7 +1125,7 @@ public: words,Decompressions); } u_comm_offset+=words; - } + } } return 0; } @@ -1206,8 +1205,8 @@ public: face_table[face_idx].size()*sizeof(face_table_host[0])); } - // if ( comms_send ) - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + if ( comms_send ) + Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; 
//spointers[0] -- low From 3f31afa4fc0c66282588dd5dbe40ae1b94ff5223 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 May 2022 18:18:51 -0700 Subject: [PATCH 025/240] Clean up verbose --- Grid/communicator/Communicator_mpi3.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index ecdf1e53..1e9e7840 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -390,16 +390,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); - // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); } } if ( CommunicatorPolicy == CommunicatorPolicySequential ) { this->StencilSendToRecvFromComplete(list,dir); + list.resize(0); } return off_node_bytes; From 47b4e914736740f914af421182757f21ece0eeb6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 May 2022 18:19:18 -0700 Subject: [PATCH 026/240] Verbose change --- Grid/qcd/action/filters/DirichletFilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/filters/DirichletFilter.h b/Grid/qcd/action/filters/DirichletFilter.h index 95353dab..4571c1de 100644 --- a/Grid/qcd/action/filters/DirichletFilter.h +++ b/Grid/qcd/action/filters/DirichletFilter.h @@ -55,7 +55,7 @@ struct DirichletFilter: public MomentumFilterBase for(int mu=0;muGlobalDimensions()[mu] ) ) { // If costly could provide Grid earlier and precompute masks - std::cout << " Dirichlet in mu="<_simd_layout[dimension]>1 && (comm_dim); int rotate_dim = _grid->_simd_layout[dimension]>2; - this->_comms_send[ii] = comm_dim; - this->_comms_recv[ii] = comm_dim; - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; @@ -909,25 +904,30 @@ public: } // Wrap locally dirichlet support case OR node local - if ( (offnode==0) || 
(comms_recv==0) ) { + if ( offnode==0 ) { int permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - + } else { + if ( comms_recv==0 ) { + + int permute_slice=1; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase + + } + + } + + if ( offnode ) { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - - // int rank = grid->_processor; - // int recv_from_rank; - // int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - } } } @@ -1058,8 +1058,6 @@ public: int comm_proc = ((x+sshift)/rd)%pd; if (comm_proc) { - - int words = buffer_size; if (cbmask != 0x3) words=words>>1; @@ -1067,64 +1065,69 @@ public: int bytes = words * compress.CommDatumSize(); int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - face_table.resize(face_idx+1); - std::vector > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); - face_table[face_idx].resize(face_table_host.size()); - acceleratorCopyToDevice(&face_table_host[0], - &face_table[face_idx][0], - face_table[face_idx].size()*sizeof(face_table_host[0])); - } + int comm_off = u_comm_offset; - // int rank = _grid->_processor; int recv_from_rank; int xmit_to_rank; + cobj *recv_buf; + cobj *send_buf; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - cobj *recv_buf; - if ( compress.DecompressionStep() ) { - recv_buf=u_simd_recv_buf[0]; - } else { - recv_buf=this->u_recv_buf_p; + if( comms_send ) { + + if ( !face_table_computed ) { + 
face_table.resize(face_idx+1); + std::vector > face_table_host ; + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + face_table[face_idx].resize(face_table_host.size()); + acceleratorCopyToDevice(&face_table_host[0], + &face_table[face_idx][0], + face_table[face_idx].size()*sizeof(face_table_host[0])); + } + + + if ( compress.DecompressionStep() ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=this->u_recv_buf_p; + } + + send_buf = this->u_send_buf_p; // Gather locally, must send + + //////////////////////////////////////////////////////// + // Gather locally + //////////////////////////////////////////////////////// + assert(send_buf!=NULL); + + Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so); } - - cobj *send_buf; - send_buf = this->u_send_buf_p; // Gather locally, must send - - //////////////////////////////////////////////////////// - // Gather locally - //////////////////////////////////////////////////////// - assert(send_buf!=NULL); - if ( comms_send ) - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); - face_idx++; - - - int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask); + int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now /////////////////////////////////////////////////////////// // Build a list of things to do after we synchronise GPUs // Start comms now??? 
/////////////////////////////////////////////////////////// - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&recv_buf[u_comm_offset], + AddPacket((void *)&send_buf[comm_off], + (void *)&recv_buf[comm_off], xmit_to_rank, comms_send, recv_from_rank, comms_recv, bytes); } - if ( compress.DecompressionStep() ) { - AddDecompress(&this->u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], + if ( compress.DecompressionStep() && comms_recv ) { + AddDecompress(&this->u_recv_buf_p[comm_off], + &recv_buf[comm_off], words,Decompressions); } + u_comm_offset+=words; + face_idx++; + } } return 0; @@ -1154,7 +1157,6 @@ public: int permute_type=_grid->PermuteType(dimension); - // std::cout << "SimdNew permute type "< > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); face_table[face_idx].resize(face_table_host.size()); acceleratorCopyToDevice(&face_table_host[0], &face_table[face_idx][0], @@ -1225,8 +1228,8 @@ public: int nbr_plane = nbr_ic; assert (sx == nbr_ox); - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset]; + auto rp = &u_simd_recv_buf[i ][comm_off]; + auto sp = &u_simd_send_buf[nbr_plane][comm_off]; if(nbr_proc){ @@ -1252,9 +1255,10 @@ public: } } - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); u_comm_offset +=buffer_size; + } } return 0; From f9f05e995bfd57885261fde425338cfc6c2aa8e1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 May 2022 18:24:38 -0700 Subject: [PATCH 028/240] Update perlmutter --- systems/Perlmutter/config-command | 1 + 1 file changed, 1 insertion(+) diff --git a/systems/Perlmutter/config-command b/systems/Perlmutter/config-command index b62704dc..4f7ecee3 100644 --- 
a/systems/Perlmutter/config-command +++ b/systems/Perlmutter/config-command @@ -6,6 +6,7 @@ PREFIX=$DIR/../Prequisites/install/ --enable-shm=nvlink \ --enable-gen-simd-width=64 \ --enable-accelerator=cuda \ + --enable-setdevice \ --disable-accelerator-cshift \ --with-gmp=$PREFIX \ --disable-fermion-reps \ From d83beaa8900a45daff806d03e272cffe803f1e65 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 May 2022 18:25:00 -0700 Subject: [PATCH 029/240] Update perlmutter --- systems/Perlmutter/dwf4.slurm | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/systems/Perlmutter/dwf4.slurm b/systems/Perlmutter/dwf4.slurm index f6d6a2a9..8a37a266 100644 --- a/systems/Perlmutter/dwf4.slurm +++ b/systems/Perlmutter/dwf4.slurm @@ -2,19 +2,26 @@ #SBATCH -A m3886_g #SBATCH -C gpu #SBATCH -q debug -#SBATCH -t 0:10:00 +#SBATCH -t 0:20:00 +#SBATCH -c 32 +#SBATCH -N 1 #SBATCH -n 4 #SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --exclusive #SBATCH --gpus-per-task=1 +#SBATCH --exclusive #SBATCH --gpu-bind=none export SLURM_CPU_BIND="cores" export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_RDMA_ENABLED_CUDA=0 -export MPICH_GPU_IPC_ENABLED=0 +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_IPC_ENABLED=1 export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 -export MPICH_GPU_NO_ASYNC_MEMCPY=1 -OPT="--comms-overlap --comms-concurrent --shm-mpi 1" -srun ./benchmarks/Benchmark_ITT --mpi 2.1.1.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > ITT.log +export MPICH_GPU_NO_ASYNC_MEMCPY=0 +#export MPICH_SMP_SINGLE_COPY_MODE=CMA + +OPT="--comms-overlap --shm-mpi 1" +VOL=64.64.32.32 +srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT +#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT +#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT + From 136d843ce7df2ad998ead7284225ea044329fbb9 Mon Sep 
17 00:00:00 2001 From: Peter Boyle Date: Wed, 25 May 2022 12:36:09 -0400 Subject: [PATCH 030/240] Crusher updates --- .../OneFlavourEvenOddRationalRatio.h | 17 ++++++---- .../pseudofermion/OneFlavourRationalRatio.h | 6 ++-- Grid/threads/Accelerator.cc | 26 +++++++-------- HMC/Mobius2p1f_DD_RHMC.cc | 12 ++++--- systems/Crusher/config-command | 7 ++-- systems/Crusher/dwf.slurm | 16 +++++---- systems/Crusher/dwf8.slurm | 33 ++++++++----------- systems/Crusher/sourceme.sh | 7 ++-- 8 files changed, 62 insertions(+), 62 deletions(-) diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index 6752ea19..2c2402f8 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -53,6 +53,9 @@ NAMESPACE_BEGIN(Grid); MultiShiftFunction PowerQuarter; MultiShiftFunction PowerNegQuarter; + MultiShiftFunction MDPowerNegHalf; + MultiShiftFunction MDPowerQuarter; + private: FermionOperator & NumOp;// the basic operator @@ -81,11 +84,13 @@ NAMESPACE_BEGIN(Grid); remez.generateApprox(param.degree,1,2); PowerHalf.Init(remez,param.tolerance,false); PowerNegHalf.Init(remez,param.tolerance,true); + MDPowerNegHalf.Init(remez,param.mdtolerance,true); // MdagM^(+- 1/4) std::cout< MpvPhi_k (n_pv,NumOp.FermionRedBlackGrid()); std::vector MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid()); @@ -246,8 +251,8 @@ NAMESPACE_BEGIN(Grid); SchurDifferentiableOperator VdagV(NumOp); SchurDifferentiableOperator MdagM(DenOp); - ConjugateGradientMultiShift msCG_V(param.MaxIter,PowerQuarter); - ConjugateGradientMultiShift msCG_M(param.MaxIter,PowerNegHalf); + ConjugateGradientMultiShift msCG_V(param.MaxIter,MDPowerQuarter); + ConjugateGradientMultiShift msCG_M(param.MaxIter,MDPowerNegHalf); msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi); msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi); @@ -266,7 +271,7 @@ NAMESPACE_BEGIN(Grid); //(1) for(int k=0;k 
CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); //////////////////////////////////// // Collect actions //////////////////////////////////// ActionLevel Level1(1); ActionLevel Level2(4); - ActionLevel Level3(6); + ActionLevel Level3(8); //////////////////////////////////// // Strange action @@ -226,7 +228,7 @@ int main(int argc, char **argv) { Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params)); if(h!=0) { - Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],CG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],MDCG,CG)); } else { Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); @@ -241,7 +243,7 @@ int main(int argc, char **argv) { for(int h=0;h dwf.64.64.64.256.8node +srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.${vol}.8node.shm-mpi1 +done - -PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1" +PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1" echo $PARAMS -srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node +srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node -PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0" +PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0" echo $PARAMS -#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > 
dwf.64.64.64.32.8node.shm0 - -PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1" -echo $PARAMS -#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node - -PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0" -echo $PARAMS -#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0 - - +srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0 diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 83bfe57c..051014dc 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -1,6 +1,9 @@ module load PrgEnv-gnu -module load rocm/4.5.0 +module load rocm/5.1.0 +module load cray-mpich/8.1.15 module load gmp -module load cray-fftw +#module load cray-fftw module load craype-accel-amd-gfx90a export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH +#Hack for lib +export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH From 4f997c5f04d469b69e2f62691b0a672cb1de7414 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 25 May 2022 11:15:25 -0700 Subject: [PATCH 031/240] Remove extra face kernels in Dirichlet --- Grid/stencil/Stencil.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index e2f17d15..07598265 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1255,7 +1255,9 @@ public: } } - AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); + if ( comms_recv ) { + AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); + } u_comm_offset +=buffer_size; From 7eb29cf5292b9889427bd9afde7e9a95737b1ae5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 28 May 2022 15:51:34 -0700 Subject: [PATCH 032/240] MPI fix --- Grid/communicator/Communicator_mpi3.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 1e9e7840..0d0a3443 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -372,7 +372,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sat, 28 May 2022 15:52:39 -0700 Subject: [PATCH 033/240] Extra easier signature for peek --- Grid/lattice/Lattice_peekpoke.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index 5caab214..f3b485a4 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ ////////////////////////////////////////////////////////// // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// +template +typename vobj::scalar_object peekSite(const Lattice &l,const Coordinate &site){ + typename vobj::scalar_object s; + peekSite(s,l,site); + return s; +} template void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ From 34faa39f4f070a1ea230e63cc8e97976125bbdab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 28 May 2022 17:18:08 -0700 Subject: [PATCH 034/240] Clean up Dirichlet. 
Big oops fix --- Grid/stencil/Stencil.h | 18 ++++++++++-------- systems/Perlmutter/dwf4.slurm | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 07598265..19eb19fb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -665,7 +665,7 @@ public: this->_comms_recv[ii] = comm_dim; if ( block && comm_dim ) { assert(abs(displacement) < ld ); - + // Quiesce communication across block boundaries if( displacement > 0 ) { // High side, low side // | <--B--->| @@ -730,7 +730,7 @@ public: int gd = _grid->_gdimensions[dimension]; int fd = _grid->_fdimensions[dimension]; int pd = _grid->_processors [dimension]; - int ld = gd/pd; + // int ld = gd/pd; int rd = _grid->_rdimensions[dimension]; int pc = _grid->_processor_coor[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); @@ -871,12 +871,14 @@ public: for(int x=0;xPermuteType(dimension); + int permute_slice; int sx = (x+sshift)%rd; int offnode = 0; if ( simd_layout > 1 ) { + permute_slice=1; for(int i=0;i>(permute_type+1)); @@ -893,6 +895,7 @@ public: } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); + permute_slice=0; } int wraparound=0; @@ -906,19 +909,18 @@ public: // Wrap locally dirichlet support case OR node local if ( offnode==0 ) { - int permute_slice=0; + permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); } else { - if ( comms_recv==0 ) { + if ( comms_recv ) { - int permute_slice=1; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase } else { - ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); } @@ -1208,7 +1210,7 @@ public: face_table[face_idx].size()*sizeof(face_table_host[0])); } - if ( 
comms_send ) + if ( comms_send || comms_recv ) Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; diff --git a/systems/Perlmutter/dwf4.slurm b/systems/Perlmutter/dwf4.slurm index 8a37a266..426573d9 100644 --- a/systems/Perlmutter/dwf4.slurm +++ b/systems/Perlmutter/dwf4.slurm @@ -19,8 +19,8 @@ export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 export MPICH_GPU_NO_ASYNC_MEMCPY=0 #export MPICH_SMP_SINGLE_COPY_MODE=CMA -OPT="--comms-overlap --shm-mpi 1" -VOL=64.64.32.32 +OPT="--comms-sequential --shm-mpi 1" +VOL=64.64.64.64 srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT From e762c940c24c6a99e7bde30daaef1986aa91e51c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 14:29:25 -0700 Subject: [PATCH 035/240] Reduce the loop over exterior for GPU to indirection table --- Grid/qcd/action/fermion/WilsonCompressor.h | 8 +++++--- .../implementation/WilsonKernelsImplementation.h | 11 +++++++++++ Grid/stencil/Stencil.h | 4 +++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index e0e08c1c..eba04abf 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -297,7 +297,7 @@ public: void ZeroCountersi(void) { } void Reporti(int calls) { } - std::vector surface_list; + // Vector surface_list; WilsonStencil(GridBase *grid, int npoints, @@ -307,10 +307,11 @@ public: : CartesianStencil (grid,npoints,checkerboard,directions,distances,p) { ZeroCountersi(); - surface_list.resize(0); + // surface_list.resize(0); this->same_node.resize(npoints); }; + /* void BuildSurfaceList(int Ls,int vol4){ // find same 
node for SHM @@ -331,7 +332,8 @@ public: } } } - + */ + template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 9228b84c..623da5cf 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -440,6 +440,17 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); +#define KERNEL_CALL_EXT(A) \ + const uint64_t NN = Nsite*Ls; \ + const uint64_t sz = st.surface_list.size(); \ + auto ptr = &st.surface_list[0]; \ + accelerator_forNB( ss, sz, Simd::Nsimd(), { \ + int sF = ptr[ss]; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ + }); \ + accelerator_barrier(); + #define ASM_CALL(A) \ thread_for( ss, Nsite, { \ int sU = ss; \ diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 19eb19fb..eb73ba5f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -640,7 +640,9 @@ public: } } if(local == 0) { - surface_list.push_back(site); + for(int s=0;s Date: Wed, 1 Jun 2022 19:25:42 -0400 Subject: [PATCH 036/240] Faster RNG init --- Grid/lattice/Lattice_rng.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index e5e63716..34df8da2 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -424,9 +424,30 @@ public: // MT implementation does not implement fast discard even though // in principle this is possible //////////////////////////////////////////////// +#if 1 + thread_for( lidx, _grid->lSites(), { + int gidx; + Coordinate pcoor; + Coordinate lcoor; + Coordinate gcoor; + _grid->LocalIndexToLocalCoor(lidx,lcoor); + pcoor=_grid->ThisProcessorCoor(); + 
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor); + + int o_idx; + int i_idx; + int rank; + _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); + assert(rank == _grid->ThisRank() ); + int l_idx=generator_idx(o_idx,i_idx); + _generators[l_idx] = master_engine; + Skip(_generators[l_idx],gidx); // Skip to next RNG sequence + }); +#else // Everybody loops over global volume. thread_for( gidx, _grid->_gsites, { + // Where is it? int rank; int o_idx; @@ -443,6 +464,7 @@ public: Skip(_generators[l_idx],gidx); // Skip to next RNG sequence } }); +#endif #else //////////////////////////////////////////////////////////////// // Machine and thread decomposition dependent seeding is efficient From 6f1a2e132beaac29be9acfe3f5e5bc375bfd8af0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 19:26:06 -0400 Subject: [PATCH 037/240] SSC mark causing problems --- Grid/perfmon/PerfCount.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index dd25b41e..540b75c5 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -72,17 +72,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, inline uint64_t cyclecount(void){ return 0; } -#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx") -#define __SSC_STOP __SSC_MARK(0x110) -#define __SSC_START __SSC_MARK(0x111) - #else -#define __SSC_MARK(mark) -#define __SSC_STOP -#define __SSC_START - /* * cycle counters arch dependent */ From a25b32847f6e62fff99fc4dd223baa42188b55fd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 19:26:37 -0400 Subject: [PATCH 038/240] Crusher patch --- configure.ac | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 9ab0595a..c8023939 100644 --- a/configure.ac +++ b/configure.ac @@ -394,11 +394,10 @@ case ${CXXTEST} in fi ;; hipcc) -# CXXFLAGS="$CXXFLAGS -Xcompiler 
-fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" CXXLD=${CXX} if test $ac_openmp = yes; then - CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" + CXXFLAGS="$CXXFLAGS -fopenmp" fi ;; dpcpp) From 58a86c9164842ca5cdfa0098f3043f12482a4286 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 19:27:06 -0400 Subject: [PATCH 039/240] SSC mark removal --- benchmarks/Benchmark_dwf.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 707f330c..c6814cdc 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -191,9 +191,7 @@ int main (int argc, char ** argv) std::cout<Barrier(); From 583f7c52f316ad0a803f87a4728dce98d34b8867 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 19:27:29 -0400 Subject: [PATCH 040/240] SSC mark --- benchmarks/Benchmark_dwf_fp32.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 9695cf5a..5ee764c4 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -262,9 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) std::cout<Barrier(); From b49db84b089bc8d91056bd878da5affa42c62bb4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 1 Jun 2022 19:27:42 -0400 Subject: [PATCH 041/240] Slurm updates --- systems/Crusher/dwf8.slurm | 24 +++++------------------- systems/Crusher/sourceme.sh | 2 +- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm index 2113dcd1..866ec775 100644 --- a/systems/Crusher/dwf8.slurm +++ b/systems/Crusher/dwf8.slurm @@ -6,9 +6,8 @@ #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J -#SBATCH -N 8 -#SBATCH -n 64 -#SBATCH --exclusive +#SBATCH -N 1 +#SBATCH -n 1 #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 DIR=. 
@@ -17,25 +16,12 @@ source setup.sh export MPICH_OFI_NIC_POLICY=GPU export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM #export MPICH_SMP_SINGLE_COPY_MODE=CMA #export MPICH_SMP_SINGLE_COPY_MODE=NONE -export OMP_NUM_THREADS=1 +export OMP_NUM_THREADS=16 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -for vol in 64.64.64.256 64.64.64.128 32.32.32.256 32.32.32.128 -do -PARAMS=" --accelerator-threads 8 --grid $vol --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1" -echo $PARAMS -srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.${vol}.8node.shm-mpi1 -done - -PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1" -echo $PARAMS -srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node - -PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0" -echo $PARAMS -srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0 +srun --gpus-per-task 1 -N1 -n1 ./tests/Test_dwf_mixedcg_prec diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 051014dc..ad0d6582 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -1,6 +1,6 @@ module load PrgEnv-gnu module load rocm/5.1.0 -module load cray-mpich/8.1.15 +module load cray-mpich/8.1.16 module load gmp #module load cray-fftw module load craype-accel-amd-gfx90a From 1ad54d049d2dc24575714c6e02f7ef6a01b3ddaa Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Thu, 2 Jun 2022 15:30:41 -0400 Subject: [PATCH 042/240] To PeriodicBC and ConjugateBC, added a new function "CshiftLink" which performs a boundary-aware C-shift of links or products of links. For the latter, the links crossing the global boundary are complex-conjugated. 
To the gauge implementations, added CshiftLink functions calling into the appropriate operation for the BC in a given direction. GaugeTransform, FourierAcceleratedGaugeFixer and WilsonLoops::FieldStrength no longer implicitly assume periodic boundary conditions; instead the shifted link is obtained using CshiftLink and is aware of the gauge implementation. Added an assert-check to ensure that the gauge fixing converges within the specified number of steps. Added functionality to compute the timeslice averaged plaquette Added functionality to compute the 5LI topological charge and timeslice topological charge Added a check of the properties of the charge conjugation matrix C=-gamma_2 gamma_4 to Test_gamma Fixed const correctness for Replicate Modified Test_fft_gfix to support either conjugate or periodic BCs, optionally disabling Fourier-accelerated gauge fixing, and tuning of alpha using cmdline options --- Grid/lattice/Lattice_transfer.h | 2 +- Grid/qcd/action/gauge/GaugeImplementations.h | 38 +++ Grid/qcd/utils/CovariantCshift.h | 41 +++ Grid/qcd/utils/GaugeFix.h | 62 +++-- Grid/qcd/utils/SUn.h | 24 +- Grid/qcd/utils/WilsonLoops.h | 251 ++++++++++++++++++- examples/Example_Laplacian.cc | 2 +- tests/core/Test_fft_gfix.cc | 187 ++++++++++---- tests/core/Test_gamma.cc | 60 +++++ 9 files changed, 572 insertions(+), 95 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index ef489ea6..aee55e93 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -855,7 +855,7 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int template -void Replicate(Lattice &coarse,Lattice & fine) +void Replicate(const Lattice &coarse,Lattice & fine) { typedef typename vobj::scalar_object sobj; diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index 16147c77..f518b236 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ 
b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -69,6 +69,11 @@ public: return PeriodicBC::ShiftStaple(Link,mu); } + //Same as Cshift for periodic BCs + static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){ + return PeriodicBC::CshiftLink(Link,mu,shift); + } + static inline bool isPeriodicGaugeField(void) { return true; } }; @@ -110,6 +115,11 @@ public: return PeriodicBC::CovShiftBackward(Link, mu, field); } + //If mu is a conjugate BC direction + //Out(x) = U^dag_\mu(x-mu) | x_\mu != 0 + // = U^T_\mu(L-1) | x_\mu == 0 + //else + //Out(x) = U^dag_\mu(x-mu mod L) static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { @@ -129,6 +139,13 @@ public: return PeriodicBC::CovShiftIdentityForward(Link,mu); } + + //If mu is a conjugate BC direction + //Out(x) = S_\mu(x+mu) | x_\mu != L-1 + // = S*_\mu(x+mu) | x_\mu == L-1 + //else + //Out(x) = S_\mu(x+mu mod L) + //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) { assert(_conjDirs.size() == Nd); @@ -138,6 +155,27 @@ public: return PeriodicBC::ShiftStaple(Link,mu); } + //Boundary-aware C-shift of gauge links / gauge transformation matrices + //For conjugate BC direction + //shift = 1 + //Out(x) = U_\mu(x+\hat\mu) | x_\mu != L-1 + // = U*_\mu(0) | x_\mu == L-1 + //shift = -1 + //Out(x) = U_\mu(x-mu) | x_\mu != 0 + // = U*_\mu(L-1) | x_\mu == 0 + //else + //shift = 1 + //Out(x) = U_\mu(x+\hat\mu mod L) + //shift = -1 + //Out(x) = U_\mu(x-\hat\mu mod L) + static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){ + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CshiftLink(Link,mu,shift); + else + return PeriodicBC::CshiftLink(Link,mu,shift); + } + static inline void setDirections(std::vector &conjDirs) { _conjDirs=conjDirs; } static inline std::vector 
getDirections(void) { return _conjDirs; } static inline bool isPeriodicGaugeField(void) { return false; } diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 6c70706f..79cf8e0f 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -88,6 +88,12 @@ namespace PeriodicBC { return CovShiftBackward(Link,mu,arg); } + //Boundary-aware C-shift of gauge links / gauge transformation matrices + template Lattice + CshiftLink(const Lattice &Link, int mu, int shift) + { + return Cshift(Link, mu, shift); + } } @@ -158,6 +164,9 @@ namespace ConjugateBC { // std::cout<<"Gparity::CovCshiftBackward mu="< Lattice CovShiftIdentityBackward(const Lattice &Link, int mu) { GridBase *grid = Link.Grid(); @@ -176,6 +185,9 @@ namespace ConjugateBC { return Link; } + //Out(x) = S_\mu(x+\hat\mu) | x_\mu != L-1 + // = S*_\mu(0) | x_\mu == L-1 + //Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices template Lattice ShiftStaple(const Lattice &Link, int mu) { @@ -208,6 +220,35 @@ namespace ConjugateBC { return CovShiftBackward(Link,mu,arg); } + //Boundary-aware C-shift of gauge links / gauge transformation matrices + //shift = 1 + //Out(x) = U_\mu(x+\hat\mu) | x_\mu != L-1 + // = U*_\mu(0) | x_\mu == L-1 + //shift = -1 + //Out(x) = U_\mu(x-mu) | x_\mu != 0 + // = U*_\mu(L-1) | x_\mu == 0 + template Lattice + CshiftLink(const Lattice &Link, int mu, int shift) + { + GridBase *grid = Link.Grid(); + int Lmu = grid->GlobalDimensions()[mu] - 1; + + Lattice> coor(grid); + LatticeCoordinate(coor, mu); + + Lattice tmp(grid); + if(shift == 1){ + tmp = Cshift(Link, mu, 1); + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return tmp; + }else if(shift == -1){ + tmp = Link; + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return Cshift(tmp, mu, -1); + }else assert(0 && "Invalid shift value"); + return tmp; //shuts up the compiler fussing about the return type + } + } diff --git 
a/Grid/qcd/utils/GaugeFix.h b/Grid/qcd/utils/GaugeFix.h index 2b3384da..c0bc2c83 100644 --- a/Grid/qcd/utils/GaugeFix.h +++ b/Grid/qcd/utils/GaugeFix.h @@ -40,27 +40,46 @@ public: typedef typename Gimpl::GaugeLinkField GaugeMat; typedef typename Gimpl::GaugeField GaugeLorentz; - static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { - for(int mu=0;mu &A,GaugeMat &dmuAmu,int orthog) { + + //The derivative of the Lie algebra field + static void DmuAmu(const std::vector &U, GaugeMat &dmuAmu,int orthog) { + GridBase* grid = U[0].Grid(); + GaugeMat Ax(grid); + GaugeMat Axm1(grid); + GaugeMat Utmp(grid); + dmuAmu=Zero(); for(int mu=0;mu &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) { + static Real SteepestDescentStep(std::vector &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) { GridBase *grid = U[0].Grid(); - std::vector A(Nd,grid); GaugeMat g(grid); - - GaugeLinkToLieAlgebraField(U,A); - ExpiAlphaDmuAmu(A,g,alpha,dmuAmu,orthog); - + ExpiAlphaDmuAmu(U,g,alpha,dmuAmu,orthog); Real vol = grid->gSites(); Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; xform = g*xform ; - SU::GaugeTransform(U,g); + SU::GaugeTransform(U,g); return trG; } - static Real FourierAccelSteepestDescentStep(std::vector &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) { + static Real FourierAccelSteepestDescentStep(std::vector &U,GaugeMat &xform, Real alpha, GaugeMat & dmuAmu,int orthog) { GridBase *grid = U[0].Grid(); @@ -157,11 +173,7 @@ public: GaugeMat g(grid); GaugeMat dmuAmu_p(grid); - std::vector A(Nd,grid); - - GaugeLinkToLieAlgebraField(U,A); - - DmuAmu(A,dmuAmu,orthog); + DmuAmu(U,dmuAmu,orthog); std::vector mask(Nd,1); for(int mu=0;mu::GaugeTransform(U,g); + SU::GaugeTransform(U,g); return trG; } - static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) { + static void ExpiAlphaDmuAmu(const std::vector &U,GaugeMat &g, Real alpha, GaugeMat &dmuAmu,int orthog) { GridBase 
*grid = g.Grid(); Complex cialpha(0.0,-alpha); GaugeMat ciadmam(grid); - DmuAmu(A,dmuAmu,orthog); + DmuAmu(U,dmuAmu,orthog); ciadmam = dmuAmu*cialpha; SU::taExp(ciadmam,g); } diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 675493b3..b9660c65 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -694,32 +694,32 @@ public: * Adjoint rep gauge xform */ - template - static void GaugeTransform( GaugeField &Umu, GaugeMat &g){ + template + static void GaugeTransform(typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){ GridBase *grid = Umu.Grid(); conformable(grid,g.Grid()); - GaugeMat U(grid); - GaugeMat ag(grid); ag = adj(g); + typename Gimpl::GaugeLinkField U(grid); + typename Gimpl::GaugeLinkField ag(grid); ag = adj(g); for(int mu=0;mu(Umu,mu); - U = g*U*Cshift(ag, mu, 1); + U = g*U*Gimpl::CshiftLink(ag, mu, 1); //BC-aware PokeIndex(Umu,U,mu); } } - template - static void GaugeTransform( std::vector &U, GaugeMat &g){ + template + static void GaugeTransform( std::vector &U, typename Gimpl::GaugeLinkField &g){ GridBase *grid = g.Grid(); - GaugeMat ag(grid); ag = adj(g); + typename Gimpl::GaugeLinkField ag(grid); ag = adj(g); for(int mu=0;mu - static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g){ + template + static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeField &Umu, typename Gimpl::GaugeLinkField &g){ LieRandomize(pRNG,g,1.0); - GaugeTransform(Umu,g); + GaugeTransform(Umu,g); } // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 0367c9fa..da1f5ac8 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -125,6 +125,56 @@ public: return sumplaq / vol / faces / Nc; // Nd , Nc dependent... 
FIXME } + ////////////////////////////////////////////////// + // sum over all spatial planes of plaquette + ////////////////////////////////////////////////// + static void siteSpatialPlaquette(ComplexField &Plaq, + const std::vector &U) { + ComplexField sitePlaq(U[0].Grid()); + Plaq = Zero(); + for (int mu = 1; mu < Nd-1; mu++) { + for (int nu = 0; nu < mu; nu++) { + traceDirPlaquette(sitePlaq, U, mu, nu); + Plaq = Plaq + sitePlaq; + } + } + } + + //////////////////////////////////// + // sum over all x,y,z and over all spatial planes of plaquette + ////////////////////////////////////////////////// + static std::vector timesliceSumSpatialPlaquette(const GaugeLorentz &Umu) { + std::vector U(Nd, Umu.Grid()); + // inefficient here + for (int mu = 0; mu < Nd; mu++) { + U[mu] = PeekIndex(Umu, mu); + } + + ComplexField Plaq(Umu.Grid()); + + siteSpatialPlaquette(Plaq, U); + typedef typename ComplexField::scalar_object sobj; + std::vector Tq; + sliceSum(Plaq, Tq, Nd-1); + + std::vector out(Tq.size()); + for(int t=0;t timesliceAvgSpatialPlaquette(const GaugeLorentz &Umu) { + std::vector sumplaq = timesliceSumSpatialPlaquette(Umu); + int Lt = Umu.Grid()->FullDimensions()[Nd-1]; + assert(sumplaq.size() == Lt); + double vol = Umu.Grid()->gSites() / Lt; + double faces = (1.0 * (Nd - 1)* (Nd - 2)) / 2.0; + for(int t=0;t(Umu, mu); // some redundant copies GaugeMat vu = v*u; //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1)); - FS = (u*v + Cshift(vu, mu, -1)); + FS = (u*v + Gimpl::CshiftLink(vu, mu, -1)); FS = 0.125*(FS - adj(FS)); } - static Real TopologicalCharge(GaugeLorentz &U){ + static Real TopologicalCharge(const GaugeLorentz &U){ // 4d topological charge assert(Nd==4); // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y) @@ -390,6 +440,203 @@ public: } + //Clover-leaf Wilson loop combination for arbitrary mu-extent M and nu extent N, mu >= nu + //cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 7 for 1x2 Wilson loop + //Clockwise ordering + static void CloverleafMxN(GaugeMat &FS, const 
GaugeMat &Umu, const GaugeMat &Unu, int mu, int nu, int M, int N){ +#define Fmu(A) Gimpl::CovShiftForward(Umu, mu, A) +#define Bmu(A) Gimpl::CovShiftBackward(Umu, mu, A) +#define Fnu(A) Gimpl::CovShiftForward(Unu, nu, A) +#define Bnu(A) Gimpl::CovShiftBackward(Unu, nu, A) +#define FmuI Gimpl::CovShiftIdentityForward(Umu, mu) +#define BmuI Gimpl::CovShiftIdentityBackward(Umu, mu) +#define FnuI Gimpl::CovShiftIdentityForward(Unu, nu) +#define BnuI Gimpl::CovShiftIdentityBackward(Unu, nu) + + //Upper right loop + GaugeMat tmp = BmuI; + for(int i=1;i(U, mu); + GaugeMat Unu = PeekIndex(U, nu); + if(M == N){ + GaugeMat F(Umu.Grid()); + CloverleafMxN(F, Umu, Unu, mu, nu, M, N); + FS = 0.125 * ( F - adj(F) ); + }else{ + //Average over both orientations + GaugeMat horizontal(Umu.Grid()), vertical(Umu.Grid()); + CloverleafMxN(horizontal, Umu, Unu, mu, nu, M, N); + CloverleafMxN(vertical, Umu, Unu, mu, nu, N, M); + FS = 0.0625 * ( horizontal - adj(horizontal) + vertical - adj(vertical) ); + } + } + + //Topological charge contribution from MxN Wilson loops + //cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6 + //output is the charge by timeslice: sum over timeslices to obtain the total + static std::vector TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){ + assert(Nd == 4); + std::vector > F(Nd,std::vector(Nd,nullptr)); + //Note F_numu = - F_munu + //hence we only need to loop over mu,nu,rho,sigma that aren't related by permuting mu,nu or rho,sigma + //Use nu > mu + for(int mu=0;mu Tq; + sliceSum(fsum, Tq, Nd-1); + + std::vector out(Tq.size()); + for(int t=0;t Tq = TimesliceTopologicalChargeMxN(U,M,N); + Real out(0); + for(int t=0;t > TimesliceTopologicalCharge5LiContributions(const GaugeLorentz &U){ + static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} }; + std::vector > out(5); + for(int i=0;i<5;i++){ + out[i] = TimesliceTopologicalChargeMxN(U,exts[i][0],exts[i][1]); + } + return out; + } + + static std::vector 
TopologicalCharge5LiContributions(const GaugeLorentz &U){ + static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} }; + std::vector out(5); + std::cout << GridLogMessage << "Computing topological charge" << std::endl; + for(int i=0;i<5;i++){ + out[i] = TopologicalChargeMxN(U,exts[i][0],exts[i][1]); + std::cout << GridLogMessage << exts[i][0] << "x" << exts[i][1] << " Wilson loop contribution " << out[i] << std::endl; + } + return out; + } + + //Compute the 5Li topological charge + static std::vector TimesliceTopologicalCharge5Li(const GaugeLorentz &U){ + std::vector > loops = TimesliceTopologicalCharge5LiContributions(U); + + double c5=1./20.; + double c4=1./5.-2.*c5; + double c3=(-64.+640.*c5)/45.; + double c2=(1-64.*c5)/9.; + double c1=(19.-55.*c5)/9.; + + int Lt = loops[0].size(); + std::vector out(Lt,0.); + for(int t=0;t Qt = TimesliceTopologicalCharge5Li(U); + Real Q = 0.; + for(int t=0;t::RandomGaugeTransform(RNG,U_GT,g); // Unit gauge + SU::RandomGaugeTransform(RNG,U_GT,g); // Unit gauge Field in_GT(&Grid); Field out_GT(&Grid); diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 87dbc242..6d617e25 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -29,14 +29,10 @@ Author: Peter Boyle #include using namespace Grid; - ; -int main (int argc, char ** argv) -{ +template +void run(double alpha, bool do_fft_gfix){ std::vector seeds({1,2,3,4}); - - Grid_init(&argc,&argv); - int threads = GridThread::GetThreads(); Coordinate latt_size = GridDefaultLatt(); @@ -55,10 +51,7 @@ int main (int argc, char ** argv) FFT theFFT(&GRID); std::cout<::ColdConfiguration(pRNG,Umu); // Unit gauge Uorg=Umu; + + Real init_plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Initial plaquette "<< init_plaq << std::endl; + + //Apply a random gauge transformation to the unit gauge config Urnd=Umu; + SU::RandomGaugeTransform(pRNG,Urnd,g); - SU::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge - - Real 
plaq=WilsonLoops::avgPlaquette(Umu); - std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,xform1,alpha,10000,1.0e-12, 1.0e-12,false); + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,xform1,alpha,10000,1.0e-12, 1.0e-12,false); // Check the gauge xform matrices Utmp=Urnd; - SU::GaugeTransform(Utmp,xform1); + SU::GaugeTransform(Utmp,xform1); Utmp = Utmp - Umu; - std::cout << " Norm Difference of xformed gauge "<< norm2(Utmp) << std::endl; + std::cout << " Check the output gauge transformation matrices applied to the original field produce the xformed field "<< norm2(Utmp) << " (expect 0)" << std::endl; - plaq=WilsonLoops::avgPlaquette(Umu); - std::cout << " Final plaquette "<::avgPlaquette(Umu); + std::cout << " Final plaquette "<::SteepestDescentGaugeFix(Umu,xform2,alpha,10000,1.0e-12, 1.0e-12,true); + + Utmp=Urnd; + SU::GaugeTransform(Utmp,xform2); + Utmp = Utmp - Umu; + std::cout << " Check the output gauge transformation matrices applied to the original field produce the xformed field "<< norm2(Utmp) << " (expect 0)" << std::endl; - std::cout<< "*****************************************************************" <::SteepestDescentGaugeFix(Umu,xform2,alpha,10000,1.0e-12, 1.0e-12,true); + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "<::GaugeTransform(Utmp,xform2); - Utmp = Utmp - Umu; - std::cout << " Norm Difference of xformed gauge "<< norm2(Utmp) << std::endl; + std::cout<< "******************************************************************************************" <::HotConfiguration(pRNG,Umu); - plaq=WilsonLoops::avgPlaquette(Umu); - std::cout << " Final plaquette "<::avgPlaquette(Umu); + std::cout << " Initial plaquette "<< init_plaq << std::endl; - std::cout<< "*****************************************************************" <::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,false); - SU::HotConfiguration(pRNG,Umu); // Unit gauge + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final 
plaquette "<::avgPlaquette(Umu); - std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true); + SU::HotConfiguration(pRNG,Umu); - plaq=WilsonLoops::avgPlaquette(Umu); - std::cout << " Final plaquette "<::avgPlaquette(Umu); + std::cout << " Initial plaquette "<< init_plaq << std::endl; - std::cout<< "*****************************************************************" <::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-12, 1.0e-12,true); + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge + SU::HotConfiguration(pRNG,Umu); - plaq=WilsonLoops::avgPlaquette(Umu); - std::cout << " Initial plaquette "<::avgPlaquette(Umu); + std::cout << " Initial plaquette "<< init_plaq << std::endl; - FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,xform3,alpha,10000,1.0e-12, 1.0e-12,true,coulomb_dir); + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,xform3,alpha,10000,1.0e-12, 1.0e-12,false,coulomb_dir); - std::cout << Umu<::avgPlaquette(Umu); + std::cout << " Final plaquette "<::avgPlaquette(Umu); - std::cout << " Final plaquette "<::HotConfiguration(pRNG,Umu); + + init_plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Initial plaquette "<< init_plaq << std::endl; + + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,xform3,alpha,10000,1.0e-12, 1.0e-12,true,coulomb_dir); + + plaq=WilsonLoops::avgPlaquette(Umu); + std::cout << " Final plaquette "<> alpha; + } + } + + + if(gimpl == "periodic"){ + std::cout << GridLogMessage << "Using periodic boundary condition" << std::endl; + run(alpha, do_fft_gfix); + }else{ + std::vector conjdirs = {1,1,0,0}; //test with 2 conjugate dirs and 2 not + std::cout << GridLogMessage << "Using complex conjugate boundary conditions in dimensions "; + for(int i=0;i(alpha, do_fft_gfix); + } + Grid_finalize(); } diff --git a/tests/core/Test_gamma.cc b/tests/core/Test_gamma.cc index e52049fe..05f8c505 100644 --- 
a/tests/core/Test_gamma.cc +++ b/tests/core/Test_gamma.cc @@ -228,6 +228,59 @@ void checkGammaL(const Gamma::Algebra a, GridSerialRNG &rng) std::cout << std::endl; } +void checkChargeConjMatrix(){ + //Check the properties of the charge conjugation matrix + //In the Grid basis C = -\gamma^2 \gamma^4 + SpinMatrix C = testAlgebra[Gamma::Algebra::MinusGammaY] * testAlgebra[Gamma::Algebra::GammaT]; + SpinMatrix mC = -C; + SpinMatrix one = testAlgebra[Gamma::Algebra::Identity]; + + std::cout << "Testing properties of charge conjugation matrix C = -\\gamma^2 \\gamma^4 (in Grid's basis)" << std::endl; + + //C^T = -C + SpinMatrix Ct = transpose(C); + std::cout << GridLogMessage << "C^T=-C "; + test(Ct, mC); + std::cout << std::endl; + + //C^\dagger = -C + SpinMatrix Cdag = adj(C); + std::cout << GridLogMessage << "C^dag=-C "; + test(Cdag, mC); + std::cout << std::endl; + + //C^* = C + SpinMatrix Cstar = conjugate(C); + std::cout << GridLogMessage << "C^*=C "; + test(Cstar, C); + std::cout << std::endl; + + //C^{-1} = -C + SpinMatrix CinvC = mC * C; + std::cout << GridLogMessage << "C^{-1}=-C "; + test(CinvC, one); + std::cout << std::endl; + + // C^{-1} \gamma^\mu C = -[\gamma^\mu]^T + Gamma::Algebra gmu_a[4] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT }; + for(int mu=0;mu<4;mu++){ + SpinMatrix gmu = testAlgebra[gmu_a[mu]]; + SpinMatrix Cinv_gmu_C = mC * gmu * C; + SpinMatrix mgmu_T = -transpose(gmu); + std::cout << GridLogMessage << "C^{-1} \\gamma^" << mu << " C = -[\\gamma^" << mu << "]^T "; + test(Cinv_gmu_C, mgmu_T); + std::cout << std::endl; + } + + //[C, \gamma^5] = 0 + SpinMatrix Cg5 = C * testAlgebra[Gamma::Algebra::Gamma5]; + SpinMatrix g5C = testAlgebra[Gamma::Algebra::Gamma5] * C; + std::cout << GridLogMessage << "C \\gamma^5 = \\gamma^5 C"; + test(Cg5, g5C); + std::cout << std::endl; +} + + int main(int argc, char *argv[]) { Grid_init(&argc,&argv); @@ -270,6 +323,13 @@ int main(int argc, char *argv[]) { 
checkGammaL(i, sRNG); } + + std::cout << GridLogMessage << "======== Charge conjugation matrix check" << std::endl; + checkChargeConjMatrix(); + std::cout << GridLogMessage << std::endl; + + + Grid_finalize(); From e9648a1635f3ee50f9e0e732b8e7aa05cf66726c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Jun 2022 23:40:04 -0400 Subject: [PATCH 043/240] Useful periodic print. CG convergence bound is remarkably accurate on low eigenvalue in numerical tests --- Grid/algorithms/iterative/ConjugateGradient.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 14f3d306..c20e267d 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -152,8 +152,13 @@ public: LinearCombTimer.Stop(); LinalgTimer.Stop(); - std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k + if ( (k % 500) == 0 ) { + std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; + } else { + std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k + << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; + } // Stopping condition if (cp <= rsq) { From 05ca7dc2529e488acc15b5477fb208204690d474 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Jun 2022 23:41:05 -0400 Subject: [PATCH 044/240] Const correctness --- Grid/qcd/action/fermion/FermionOperator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index 0c159300..66644d7f 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -49,7 +49,7 @@ public: virtual FermionField &tmp(void) = 0; - virtual void DirichletBlock(Coordinate & _Block) { assert(0); }; + virtual void DirichletBlock(const 
Coordinate & _Block) { assert(0); }; GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; From 501bb117bf349bd2a447d9d60ebe97b82b368998 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 00:04:09 -0400 Subject: [PATCH 045/240] Const correct --- Grid/qcd/action/fermion/WilsonFermion5D.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 91abf86a..eced6b81 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -178,16 +178,8 @@ public: GridRedBlackCartesian &FourDimRedBlackGrid, double _M5,const ImplParams &p= ImplParams()); - virtual void DirichletBlock(Coordinate & block) + virtual void DirichletBlock(const Coordinate & block) { - assert(block.size()==Nd+1); - if ( block[0] || block[1] || block[2] || block[3] || block[4] ){ - Dirichlet = 1; - Block = block; - Stencil.DirichletBlock(block); - StencilEven.DirichletBlock(block); - StencilOdd.DirichletBlock(block); - } } // Constructors /* From fdef7a1a8cedad899a073d4ad46c58c7514d069d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 00:05:20 -0400 Subject: [PATCH 046/240] Dirichlet fix --- .../implementation/WilsonFermion5DImplementation.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 7775ad9d..681a6914 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -92,6 +92,15 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); } + if ( p.dirichlet.size() ) { + Coordinate block = 
p.dirichlet; + assert(block.size()==Nd+1); + if ( block[0] || block[1] || block[2] || block[3] || block[4] ){ + Dirichlet = 1; + Block = block; + } + } + if (Impl::LsVectorised) { int nsimd = Simd::Nsimd(); From 6efd80f104c48d08e24dd79e70679b5d77bf18b7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 18:23:46 -0400 Subject: [PATCH 047/240] Printing --- Grid/qcd/hmc/integrators/Integrator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 070cbea1..1985caf0 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -145,7 +145,7 @@ protected: MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; From 0e9666bc92341b8ff31af298aa13cdc2b5f84cab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 19:18:42 -0400 Subject: [PATCH 048/240] Test update --- systems/Crusher/dwf8.slurm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm index 866ec775..4bc1917a 100644 --- a/systems/Crusher/dwf8.slurm +++ b/systems/Crusher/dwf8.slurm @@ -7,8 +7,9 @@ #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 -#SBATCH -n 1 -#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 +#SBATCH -n 8 +##SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 +#SBATCH --gpu-bind=map_gpu:0,1,2,3,6,7,4,5 DIR=. 
source setup.sh @@ -23,5 +24,5 @@ export OMP_NUM_THREADS=16 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -srun --gpus-per-task 1 -N1 -n1 ./tests/Test_dwf_mixedcg_prec +srun --gpus-per-task 1 -N1 -n8 ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.1 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8 From d10d30dda84bd8f113197fe759048e7be2e0e349 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 19:18:58 -0400 Subject: [PATCH 049/240] Script update --- systems/Crusher/dwf4.slurm | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/systems/Crusher/dwf4.slurm b/systems/Crusher/dwf4.slurm index 6bb953c4..8e2a6e4c 100644 --- a/systems/Crusher/dwf4.slurm +++ b/systems/Crusher/dwf4.slurm @@ -7,21 +7,19 @@ #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 -#SBATCH -n 4 -#SBATCH --exclusive +#SBATCH -n 2 +#SBATCH --gpu-bind=map_gpu:0,1 DIR=. -module list +source setup.sh + +export MPICH_OFI_NIC_POLICY=GPU export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -export MPICH_SMP_SINGLE_COPY_MODE=NONE -#export MPICH_SMP_SINGLE_COPY_MODE=CMA -export OMP_NUM_THREADS=4 +export OMP_NUM_THREADS=16 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS +srun --gpus-per-task 1 -N1 -n2 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.16.32.64 --shm-mpi 1 --shm 2048 --comms-sequential --accelerator-threads 8 From 31efa5c4da075683d636f05faea7b6b965f648ce Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 15 Jun 2022 19:19:44 -0400 Subject: [PATCH 050/240] Script updates for current summit --- systems/Summit/config-command | 1 + systems/Summit/dwf16.lsf | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git 
a/systems/Summit/config-command b/systems/Summit/config-command index b565addc..46e37af1 100644 --- a/systems/Summit/config-command +++ b/systems/Summit/config-command @@ -7,6 +7,7 @@ --enable-setdevice \ --disable-fermion-reps \ --enable-accelerator=cuda \ + --disable-accelerator-cshift \ --prefix /ccs/home/paboyle/prefix \ CXX=nvcc \ LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \ diff --git a/systems/Summit/dwf16.lsf b/systems/Summit/dwf16.lsf index ef8c21a5..3242fc86 100644 --- a/systems/Summit/dwf16.lsf +++ b/systems/Summit/dwf16.lsf @@ -1,25 +1,39 @@ #!/bin/bash #BSUB -P LGT104 -#BSUB -W 2:00 +#BSUB -W 0:20 #BSUB -nnodes 16 #BSUB -J DWF + export OMP_NUM_THREADS=6 export PAMI_IBV_ADAPTER_AFFINITY=1 export PAMI_ENABLE_STRIPING=1 -export OPT="--comms-concurrent --comms-overlap " -APP="./benchmarks/Benchmark_comms_host_device --mpi 4.4.4.3 " -jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log +DIR=. +source sourceme.sh -APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " -jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " -jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log +VOLS=( 32.32.32.16 32.32.32.64 64.32.32.64 64.32.64.64 64.64.64.64 64.64.64.128 64.64.64.256 64.64.64.512 128.64.64.64.512) +MPI=( 1.1.1.1 1.1.1.4 2.1.1.4 2.1.2.4 2.2.2.4 2.2.2.8 2.2.2.16 2.2.2.32 4.4.2.32 ) +RANKS=( 1 4 8 16 32 64 128 256 1024) +NODES=( 1 1 2 4 8 16 32 64 128) +INTS=( 0 1 2 3 4 5 6 7 8) +for i in 5 +do + vol=${VOLS[$i]} + nodes=${NODES[$i]} + mpi=${MPI[$i]} + ranks=${RANKS[$i]} + JSRUN="jsrun 
--nrs $nodes -a4 -g4 -c42 -dpacked -b packed:10 --latency_priority gpu-cpu --smpiargs=-gpu" + PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-sequential --shm 2048 --shm-mpi 0" + $JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.seq.ker + PARAMS=" --accelerator-threads 8 --grid $vol --mpi $mpi --comms-overlap --shm 2048 --shm-mpi 0" + $JSRUN ./benchmarks/Benchmark_dwf_fp32 $PARAMS > run.v${vol}.n${nodes}.m${mpi}.over.ker +done From fd933420c65b927da2c12f6c355e26f7d610db1f Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 22 Jun 2022 10:27:48 -0400 Subject: [PATCH 051/240] Imported changes from feature/gparity_HMC branch: Added a bounds-check function for the RHMC with arbitrary power Added a pseudofermion action for the rational ratio with an arbitrary power and a mixed-precision variant of the same. The existing one-flavor rational ratio class now uses the general class under the hood To support testing of the two-flavor even-odd ratio pseudofermion, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field Added a new HMC runner start type: CheckpointStartReseed, which reseeds the RNG from scratch, allowing for the creation of new evolution streams from an existing checkpoint. Added log output of seeds used when the RNG is seeded. EOFA changes: To support mixed-precision inversion, generalized the class to maintain a separate solver for the L and R operators in the heatbath (separate solvers are already implemented for the other stages) To support mixed-precision, the action of setting the operator shift coefficients is now maintained in a virtual function. A derived class for mixed-precision solvers ensures the coefficients are applied to both the double and single-prec operators The ||^2 of the random source is now stored by the heatbath and compared to the initial action when it is computed. 
These should be equal but may differ if the rational bounds are not chosen correctly, hence serving as a useful and free test Fixed calculation of M_eofa (previously incomplete and #if'd out) Added functionality to compute M_eofa^-1 to complement the calculation of M_eofa (both are equally expensive!) To support testing, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field Added a test program which computes the G-parity force using the 1 and 2 flavor implementations and compares the result. Test supports DWF, EOFA and DSDR actions, chosen by a command line option. The Mobius EOFA force test now also checks the rational approximation used for the heatbath Added a test program for the mixed precision EOFA compared to the double-prec implementation, G-parity HMC test now applied GPBC in the y direction and not the t direction (GPBC in t are no longer supported) and checkpoints after every configuration Added a test program which computes the two-flavor G-parity action (via RHMC) with both the 1 and 2 flavor implementations and checks they agree Added a test program to check the implementation of M_eofa^{-1} --- Grid/qcd/action/ActionParams.h | 52 +- Grid/qcd/action/pseudofermion/Bounds.h | 60 ++- .../pseudofermion/ExactOneFlavourRatio.h | 229 +++++++-- .../GeneralEvenOddRationalRatio.h | 372 +++++++++++++++ .../GeneralEvenOddRationalRatioMixedPrec.h | 93 ++++ .../OneFlavourEvenOddRationalRatio.h | 262 +--------- Grid/qcd/action/pseudofermion/PseudoFermion.h | 2 + .../pseudofermion/TwoFlavourEvenOddRatio.h | 28 +- Grid/qcd/hmc/GenericHMCrunner.h | 12 +- Grid/qcd/hmc/HMCModules.h | 2 + Grid/qcd/hmc/integrators/Integrator.h | 6 +- tests/forces/Test_gpdwf_force_1f_2f.cc | 446 ++++++++++++++++++ tests/forces/Test_mobius_force_eofa.cc | 42 ++ .../forces/Test_mobius_gparity_eofa_mixed.cc | 233 +++++++++ tests/hmc/Test_action_dwf_gparity2fvs1f.cc | 257 ++++++++++ 
tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 4 +- tests/solver/Test_eofa_inv.cc | 125 +++++ 17 files changed, 1920 insertions(+), 305 deletions(-) create mode 100644 Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h create mode 100644 Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatioMixedPrec.h create mode 100644 tests/forces/Test_gpdwf_force_1f_2f.cc create mode 100644 tests/forces/Test_mobius_gparity_eofa_mixed.cc create mode 100644 tests/hmc/Test_action_dwf_gparity2fvs1f.cc create mode 100644 tests/solver/Test_eofa_inv.cc diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index 6a3f053a..13f2e594 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid); // These can move into a params header and be given MacroMagic serialisation struct GparityWilsonImplParams { Coordinate twists; + //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs GparityWilsonImplParams() : twists(Nd, 0) {}; }; @@ -66,7 +67,8 @@ struct StaggeredImplParams { RealD, mdtolerance, int, degree, int, precision, - int, BoundsCheckFreq); + int, BoundsCheckFreq, + RealD, BoundsCheckTol); // MaxIter and tolerance, vectors?? @@ -78,7 +80,8 @@ struct StaggeredImplParams { int _degree = 10, int _precision = 64, int _BoundsCheckFreq=20, - RealD mdtol = 1.0e-6) + RealD mdtol = 1.0e-6, + double _BoundsCheckTol=1e-6) : lo(_lo), hi(_hi), MaxIter(_maxit), @@ -86,9 +89,52 @@ struct StaggeredImplParams { mdtolerance(mdtol), degree(_degree), precision(_precision), - BoundsCheckFreq(_BoundsCheckFreq){}; + BoundsCheckFreq(_BoundsCheckFreq), + BoundsCheckTol(_BoundsCheckTol){}; }; + /*Action parameters for the generalized rational action + The approximation is for (M^dag M)^{1/inv_pow} + where inv_pow is the denominator of the fractional power. 
+ Default inv_pow=2 for square root, making this equivalent to + the OneFlavourRational action + */ + struct RationalActionParams : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams, + int, inv_pow, + RealD, lo, //low eigenvalue bound of rational approx + RealD, hi, //high eigenvalue bound of rational approx + int, MaxIter, //maximum iterations in msCG + RealD, action_tolerance, //msCG tolerance in action evaluation + int, action_degree, //rational approx tolerance in action evaluation + RealD, md_tolerance, //msCG tolerance in MD integration + int, md_degree, //rational approx tolerance in MD integration + int, precision, //precision of floating point arithmetic + int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check + // constructor + RationalActionParams(int _inv_pow = 2, + RealD _lo = 0.0, + RealD _hi = 1.0, + int _maxit = 1000, + RealD _action_tolerance = 1.0e-8, + int _action_degree = 10, + RealD _md_tolerance = 1.0e-8, + int _md_degree = 10, + int _precision = 64, + int _BoundsCheckFreq=20) + : inv_pow(_inv_pow), + lo(_lo), + hi(_hi), + MaxIter(_maxit), + action_tolerance(_action_tolerance), + action_degree(_action_degree), + md_tolerance(_md_tolerance), + md_degree(_md_degree), + precision(_precision), + BoundsCheckFreq(_BoundsCheckFreq){}; + }; + + NAMESPACE_END(Grid); #endif diff --git a/Grid/qcd/action/pseudofermion/Bounds.h b/Grid/qcd/action/pseudofermion/Bounds.h index b9621f24..8864b1d7 100644 --- a/Grid/qcd/action/pseudofermion/Bounds.h +++ b/Grid/qcd/action/pseudofermion/Bounds.h @@ -65,13 +65,65 @@ NAMESPACE_BEGIN(Grid); X=X-Y; RealD Nd = norm2(X); std::cout << "************************* "< void InversePowerBoundsCheck(int inv_pow, + int MaxIter,double tol, + LinearOperatorBase &HermOp, + Field &GaussNoise, + MultiShiftFunction &ApproxNegPow) + { + GridBase *FermionGrid = GaussNoise.Grid(); + + Field X(FermionGrid); + Field Y(FermionGrid); + Field Z(FermionGrid); 
+ + Field tmp1(FermionGrid), tmp2(FermionGrid); + + X=GaussNoise; + RealD Nx = norm2(X); + + ConjugateGradientMultiShift msCG(MaxIter,ApproxNegPow); + + tmp1 = X; + + Field* in = &tmp1; + Field* out = &tmp2; + for(int i=0;i class ExactOneFlavourRatioPseudoFermionAction : public Action { @@ -57,37 +61,60 @@ NAMESPACE_BEGIN(Grid); bool use_heatbath_forecasting; AbstractEOFAFermion& Lop; // the basic LH operator AbstractEOFAFermion& Rop; // the basic RH operator - SchurRedBlackDiagMooeeSolve SolverHB; + SchurRedBlackDiagMooeeSolve SolverHBL; + SchurRedBlackDiagMooeeSolve SolverHBR; SchurRedBlackDiagMooeeSolve SolverL; SchurRedBlackDiagMooeeSolve SolverR; SchurRedBlackDiagMooeeSolve DerivativeSolverL; SchurRedBlackDiagMooeeSolve DerivativeSolverR; FermionField Phi; // the pseudofermion field for this trajectory + RealD norm2_eta; //|eta|^2 where eta is the random gaussian field used to generate the pseudofermion field + bool initial_action; //true for the first call to S after refresh, for which the identity S = |eta|^2 holds provided the rational approx is good public: + //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator + virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){ + AbstractEOFAFermion&op = LorR == 0 ? 
Lop : Rop; + op.RefreshShiftCoefficients(to); + } + + + //Use the same solver for L,R in all cases ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion& _Lop, AbstractEOFAFermion& _Rop, OperatorFunction& CG, Params& p, bool use_fc=false) - : ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,p,use_fc) {}; - + : ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,CG,CG,CG,CG,CG,CG,p,use_fc) {}; + + //Use the same solver for L,R in the heatbath but different solvers elsewhere ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion& _Lop, AbstractEOFAFermion& _Rop, - OperatorFunction& HeatbathCG, + OperatorFunction& HeatbathCG, + OperatorFunction& ActionCGL, OperatorFunction& ActionCGR, + OperatorFunction& DerivCGL , OperatorFunction& DerivCGR, + Params& p, + bool use_fc=false) + : ExactOneFlavourRatioPseudoFermionAction(_Lop,_Rop,HeatbathCG,HeatbathCG, ActionCGL, ActionCGR, DerivCGL,DerivCGR,p,use_fc) {}; + + //Use different solvers for L,R in all cases + ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion& _Lop, + AbstractEOFAFermion& _Rop, + OperatorFunction& HeatbathCGL, OperatorFunction& HeatbathCGR, OperatorFunction& ActionCGL, OperatorFunction& ActionCGR, OperatorFunction& DerivCGL , OperatorFunction& DerivCGR, Params& p, bool use_fc=false) : Lop(_Lop), Rop(_Rop), - SolverHB(HeatbathCG,false,true), + SolverHBL(HeatbathCGL,false,true), SolverHBR(HeatbathCGR,false,true), SolverL(ActionCGL, false, true), SolverR(ActionCGR, false, true), DerivativeSolverL(DerivCGL, false, true), DerivativeSolverR(DerivCGR, false, true), Phi(_Lop.FermionGrid()), param(p), - use_heatbath_forecasting(use_fc) + use_heatbath_forecasting(use_fc), + initial_action(false) { AlgRemez remez(param.lo, param.hi, param.precision); @@ -97,6 +124,8 @@ NAMESPACE_BEGIN(Grid); PowerNegHalf.Init(remez, param.tolerance, true); }; + const FermionField &getPhi() const{ return Phi; } + virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; } 
virtual std::string LogParameters() { @@ -117,6 +146,19 @@ NAMESPACE_BEGIN(Grid); else{ for(int s=0; s sig^2 = 0.5. + // + RealD scale = std::sqrt(0.5); + + FermionField eta (Lop.FermionGrid()); + gaussian(pRNG,eta); eta = eta * scale; + + refresh(U,eta); + } + // EOFA heatbath: see Eqn. (29) of arXiv:1706.05843 // We generate a Gaussian noise vector \eta, and then compute // \Phi = M_{\rm EOFA}^{-1/2} * \eta @@ -124,12 +166,10 @@ NAMESPACE_BEGIN(Grid); // // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta // - virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) - { + void refresh(const GaugeField &U, const FermionField &eta) { Lop.ImportGauge(U); Rop.ImportGauge(U); - FermionField eta (Lop.FermionGrid()); FermionField CG_src (Lop.FermionGrid()); FermionField CG_soln (Lop.FermionGrid()); FermionField Forecast_src(Lop.FermionGrid()); @@ -140,11 +180,6 @@ NAMESPACE_BEGIN(Grid); if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); } ChronoForecast, FermionField> Forecast; - // Seed with Gaussian noise vector (var = 0.5) - RealD scale = std::sqrt(0.5); - gaussian(pRNG,eta); - eta = eta * scale; - // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta RealD N(PowerNegHalf.norm); for(int k=0; k tmp(2, Lop.FermionGrid()); - mPhi = phi; + out = in; // LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi> - spProj(Phi, spProj_Phi, -1, Lop.Ls); - Lop.Omega(spProj_Phi, tmp[0], -1, 0); + spProj(in, spProj_in, -1, Lop.Ls); + Lop.Omega(spProj_in, tmp[0], -1, 0); G5R5(tmp[1], tmp[0]); tmp[0] = Zero(); SolverL(Lop, tmp[1], tmp[0]); Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back Lop.Omega(tmp[1], tmp[0], -1, 1); - mPhi = mPhi - Lop.k * innerProduct(spProj_Phi, tmp[0]).real(); + spProj(tmp[0], tmp[1], -1, Lop.Ls); + + out = out - Lop.k * tmp[1]; // RH term: S = S + k <\Phi| P_{+} 
\Omega_{+}^{\dagger} ( H(mb) - // - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi> - spProj(Phi, spProj_Phi, 1, Rop.Ls); - Rop.Omega(spProj_Phi, tmp[0], 1, 0); + // - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi> + spProj(in, spProj_in, 1, Rop.Ls); + Rop.Omega(spProj_in, tmp[0], 1, 0); G5R5(tmp[1], tmp[0]); tmp[0] = Zero(); SolverR(Rop, tmp[1], tmp[0]); Rop.Dtilde(tmp[0], tmp[1]); Rop.Omega(tmp[1], tmp[0], 1, 1); - action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real(); -#endif + spProj(tmp[0], tmp[1], 1, Rop.Ls); + + out = out + Rop.k * tmp[1]; } + //Due to the structure of EOFA, it is no more expensive to compute the inverse of Meofa + //To ensure correctness we can simply reuse the heatbath code but use the rational approx + //f(x) = 1/x which corresponds to alpha_0=0, alpha_1=1, beta_1=0 => gamma_1=1 + void MeofaInv(const GaugeField &U, const FermionField &in, FermionField &out) { + Lop.ImportGauge(U); + Rop.ImportGauge(U); + + FermionField CG_src (Lop.FermionGrid()); + FermionField CG_soln (Lop.FermionGrid()); + std::vector tmp(2, Lop.FermionGrid()); + + // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta + // = 1 * \eta + out = in; + + // LH terms: + // \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf) + // - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta + spProj(in, tmp[0], -1, Lop.Ls); + Lop.Omega(tmp[0], tmp[1], -1, 0); + G5R5(CG_src, tmp[1]); + { + heatbathRefreshShiftCoefficients(0, -1.); //-gamma_1 = -1. 
+ + CG_soln = Zero(); // Just use zero as the initial guess + SolverHBL(Lop, CG_src, CG_soln); + + Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back + tmp[1] = Lop.k * tmp[0]; + } + Lop.Omega(tmp[1], tmp[0], -1, 1); + spProj(tmp[0], tmp[1], -1, Lop.Ls); + out = out + tmp[1]; + + // RH terms: + // \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb) + // - \beta_l\gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta + spProj(in, tmp[0], 1, Rop.Ls); + Rop.Omega(tmp[0], tmp[1], 1, 0); + G5R5(CG_src, tmp[1]); + { + heatbathRefreshShiftCoefficients(1, 0.); //-gamma_1 * beta_1 = 0 + + CG_soln = Zero(); + SolverHBR(Rop, CG_src, CG_soln); + + Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back + tmp[1] = - Rop.k * tmp[0]; + } + Rop.Omega(tmp[1], tmp[0], 1, 1); + spProj(tmp[0], tmp[1], 1, Rop.Ls); + out = out + tmp[1]; + + // Reset shift coefficients for energy and force evals + heatbathRefreshShiftCoefficients(0, 0.0); + heatbathRefreshShiftCoefficients(1, -1.0); + }; + + + + // EOFA action: see Eqn. (10) of arXiv:1706.05843 virtual RealD S(const GaugeField& U) { @@ -271,7 +374,7 @@ NAMESPACE_BEGIN(Grid); action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real(); // RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb) - // - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi> + // - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} |\Phi> spProj(Phi, spProj_Phi, 1, Rop.Ls); Rop.Omega(spProj_Phi, tmp[0], 1, 0); G5R5(tmp[1], tmp[0]); @@ -281,6 +384,26 @@ NAMESPACE_BEGIN(Grid); Rop.Omega(tmp[1], tmp[0], 1, 1); action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real(); + if(initial_action){ + //For the first call to S after refresh, S = |eta|^2. 
We can use this to ensure the rational approx is good + RealD diff = action - norm2_eta; + + //S_init = eta^dag M^{-1/2} M M^{-1/2} eta + //S_init - eta^dag eta = eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta + + //If approximate solution + //S_init - eta^dag eta = eta^dag ( [M^{-1/2}+\delta M^{-1/2}] M [M^{-1/2}+\delta M^{-1/2}] - 1 ) eta + // \approx eta^dag ( \delta M^{-1/2} M^{1/2} + M^{1/2}\delta M^{-1/2} ) eta + // We divide out |eta|^2 to remove source scaling but the tolerance on this check should still be somewhat higher than the actual approx tolerance + RealD test = fabs(diff)/norm2_eta; //test the quality of the rational approx + + std::cout << GridLogMessage << action_name() << " initial action " << action << " expect " << norm2_eta << "; diff " << diff << std::endl; + std::cout << GridLogMessage << action_name() << "[ eta^dag ( M^{-1/2} M M^{-1/2} - 1 ) eta ]/|eta^2| = " << test << " expect 0 (tol " << param.BoundsCheckTol << ")" << std::endl; + + assert( ( test < param.BoundsCheckTol ) && " Initial action check failed" ); + initial_action = false; + } + return action; }; @@ -329,6 +452,40 @@ NAMESPACE_BEGIN(Grid); }; }; + template + class ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction : public ExactOneFlavourRatioPseudoFermionAction{ + public: + INHERIT_IMPL_TYPES(ImplD); + typedef OneFlavourRationalParams Params; + + private: + AbstractEOFAFermion& LopF; // the basic LH operator + AbstractEOFAFermion& RopF; // the basic RH operator + + public: + + virtual std::string action_name() { return "ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction"; } + + //Used in the heatbath, refresh the shift coefficients of the L (LorR=0) or R (LorR=1) operator + virtual void heatbathRefreshShiftCoefficients(int LorR, RealD to){ + AbstractEOFAFermion &op = LorR == 0 ? 
LopF : RopF; + op.RefreshShiftCoefficients(to); + this->ExactOneFlavourRatioPseudoFermionAction::heatbathRefreshShiftCoefficients(LorR,to); + } + + ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction(AbstractEOFAFermion& _LopF, + AbstractEOFAFermion& _RopF, + AbstractEOFAFermion& _LopD, + AbstractEOFAFermion& _RopD, + OperatorFunction& HeatbathCGL, OperatorFunction& HeatbathCGR, + OperatorFunction& ActionCGL, OperatorFunction& ActionCGR, + OperatorFunction& DerivCGL , OperatorFunction& DerivCGR, + Params& p, + bool use_fc=false) : + LopF(_LopF), RopF(_RopF), ExactOneFlavourRatioPseudoFermionAction(_LopD, _RopD, HeatbathCGL, HeatbathCGR, ActionCGL, ActionCGR, DerivCGL, DerivCGR, p, use_fc){} + }; + + NAMESPACE_END(Grid); #endif diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h new file mode 100644 index 00000000..2b08cf49 --- /dev/null +++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h @@ -0,0 +1,372 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h + + Copyright (C) 2015 + + Author: Christopher Kelly + Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H +#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_H + +NAMESPACE_BEGIN(Grid); + + ///////////////////////////////////////////////////////// + // Generic rational approximation for ratios of operators + ///////////////////////////////////////////////////////// + + /* S_f = -log( det( [M^dag M]/[V^dag V] )^{1/inv_pow} ) + = chi^dag ( [M^dag M]/[V^dag V] )^{-1/inv_pow} chi\ + = chi^dag ( [V^dag V]^{-1/2} [M^dag M] [V^dag V]^{-1/2} )^{-1/inv_pow} chi\ + = chi^dag [V^dag V]^{1/(2*inv_pow)} [M^dag M]^{-1/inv_pow} [V^dag V]^{1/(2*inv_pow)} chi\ + + S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi + + BIG WARNING: + Here V^dag V is referred to in this code as the "numerator" operator and M^dag M is the *denominator* operator. 
+ this refers to their position in the pseudofermion action, which is the *inverse* of what appears in the determinant + Thus for DWF the numerator operator is the Pauli-Villars operator + + Here P/Q \sim R_{1/(2*inv_pow)} ~ (V^dagV)^{1/(2*inv_pow)} + Here N/D \sim R_{-1/inv_pow} ~ (M^dagM)^{-1/inv_pow} + */ + + template + class GeneralEvenOddRatioRationalPseudoFermionAction : public Action { + public: + + INHERIT_IMPL_TYPES(Impl); + + typedef RationalActionParams Params; + Params param; + + //For action evaluation + MultiShiftFunction ApproxPowerAction ; //rational approx for X^{1/inv_pow} + MultiShiftFunction ApproxNegPowerAction; //rational approx for X^{-1/inv_pow} + MultiShiftFunction ApproxHalfPowerAction; //rational approx for X^{1/(2*inv_pow)} + MultiShiftFunction ApproxNegHalfPowerAction; //rational approx for X^{-1/(2*inv_pow)} + + //For the MD integration + MultiShiftFunction ApproxPowerMD ; //rational approx for X^{1/inv_pow} + MultiShiftFunction ApproxNegPowerMD; //rational approx for X^{-1/inv_pow} + MultiShiftFunction ApproxHalfPowerMD; //rational approx for X^{1/(2*inv_pow)} + MultiShiftFunction ApproxNegHalfPowerMD; //rational approx for X^{-1/(2*inv_pow)} + + private: + + FermionOperator & NumOp;// the basic operator + FermionOperator & DenOp;// the basic operator + FermionField PhiEven; // the pseudo fermion field for this trajectory + FermionField PhiOdd; // the pseudo fermion field for this trajectory + + //Generate the approximation to x^{1/inv_pow} (->approx) and x^{-1/inv_pow} (-> approx_inv) by an approx_degree degree rational approximation + //CG_tolerance is used to issue a warning if the approximation error is larger than the tolerance of the CG and is otherwise just stored in the MultiShiftFunction for use by the multi-shift + static void generateApprox(MultiShiftFunction &approx, MultiShiftFunction &approx_inv, int inv_pow, int approx_degree, double CG_tolerance, AlgRemez &remez){ + std::cout< CG_tolerance) + std::cout< 
schurOp(numerator ? NumOp : DenOp); + ConjugateGradientMultiShift msCG(MaxIter, approx); + msCG(schurOp,in, out); + } + virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionField &in, std::vector &out_elems, FermionField &out){ + SchurDifferentiableOperator schurOp(numerator ? NumOp : DenOp); + ConjugateGradientMultiShift msCG(MaxIter, approx); + msCG(schurOp,in, out_elems, out); + } + //Allow derived classes to override the gauge import + virtual void ImportGauge(const GaugeField &U){ + NumOp.ImportGauge(U); + DenOp.ImportGauge(U); + } + + public: + + GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator &_NumOp, + FermionOperator &_DenOp, + const Params & p + ) : + NumOp(_NumOp), + DenOp(_DenOp), + PhiOdd (_NumOp.FermionRedBlackGrid()), + PhiEven(_NumOp.FermionRedBlackGrid()), + param(p) + { + std::cout<Broadcast(0,r); + + if ( param.BoundsCheckFreq != 0 && (r % param.BoundsCheckFreq)==0 ) { + std::cout< MdagM(DenOp); + std::cout< MpvPhi_k (n_pv,NumOp.FermionRedBlackGrid()); + std::vector MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid()); + std::vector MfMpvPhi_k (n_f ,NumOp.FermionRedBlackGrid()); + + FermionField MpvPhi(NumOp.FermionRedBlackGrid()); + FermionField MfMpvPhi(NumOp.FermionRedBlackGrid()); + FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid()); + FermionField Y(NumOp.FermionRedBlackGrid()); + + GaugeField tmp(NumOp.GaugeGrid()); + + ImportGauge(U); + + std::cout< MdagM(DenOp); + SchurDifferentiableOperator VdagV(NumOp); + + + RealD ak; + + dSdU = Zero(); + + // With these building blocks + // + // dS/dU = + // \sum_k -ak MfMpvPhi_k^dag [ dM^dag M + M^dag dM ] MfMpvPhi_k (1) + // + \sum_k -ak MpvMfMpvPhi_k^\dag [ dV^dag V + V^dag dV ] MpvPhi_k (2) + // -ak MpvPhi_k^dag [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k (3) + + //(1) + std::cout< + Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public 
License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H +#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H + +NAMESPACE_BEGIN(Grid); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Generic rational approximation for ratios of operators utilizing the mixed precision multishift algorithm + // cf. 
GeneralEvenOddRational.h for details + ///////////////////////////////////////////////////////////////////////////////////////////////////////////// + + template + class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction { + private: + typedef typename ImplD::FermionField FermionFieldD; + typedef typename ImplF::FermionField FermionFieldF; + + FermionOperator & NumOpD; + FermionOperator & DenOpD; + + FermionOperator & NumOpF; + FermionOperator & DenOpF; + + Integer ReliableUpdateFreq; + protected: + + //Allow derived classes to override the multishift CG + virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){ + SchurDifferentiableOperator schurOpD(numerator ? NumOpD : DenOpD); + SchurDifferentiableOperator schurOpF(numerator ? NumOpF : DenOpF); + + ConjugateGradientMultiShiftMixedPrec msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); + msCG(schurOpD, in, out); + } + virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector &out_elems, FermionFieldD &out){ + SchurDifferentiableOperator schurOpD(numerator ? NumOpD : DenOpD); + SchurDifferentiableOperator schurOpF(numerator ? 
NumOpF : DenOpF); + + ConjugateGradientMultiShiftMixedPrec msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); + msCG(schurOpD, in, out_elems, out); + } + //Allow derived classes to override the gauge import + virtual void ImportGauge(const typename ImplD::GaugeField &Ud){ + typename ImplF::GaugeField Uf(NumOpF.GaugeGrid()); + precisionChange(Uf, Ud); + + NumOpD.ImportGauge(Ud); + DenOpD.ImportGauge(Ud); + + NumOpF.ImportGauge(Uf); + DenOpF.ImportGauge(Uf); + } + + public: + GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator &_NumOpD, FermionOperator &_DenOpD, + FermionOperator &_NumOpF, FermionOperator &_DenOpF, + const RationalActionParams & p, Integer _ReliableUpdateFreq + ) : GeneralEvenOddRatioRationalPseudoFermionAction(_NumOpD, _DenOpD, p), + ReliableUpdateFreq(_ReliableUpdateFreq), NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF){} + + virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";} + }; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index 6752ea19..1b36ae0f 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -40,257 +40,31 @@ NAMESPACE_BEGIN(Grid); // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2} template - class OneFlavourEvenOddRatioRationalPseudoFermionAction : public Action { + class OneFlavourEvenOddRatioRationalPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction { public: - - INHERIT_IMPL_TYPES(Impl); - typedef OneFlavourRationalParams Params; - Params param; - - MultiShiftFunction PowerHalf ; - MultiShiftFunction PowerNegHalf; - MultiShiftFunction PowerQuarter; - MultiShiftFunction PowerNegQuarter; - private: - - FermionOperator & NumOp;// the basic operator - FermionOperator & DenOp;// the 
basic operator - FermionField PhiEven; // the pseudo fermion field for this trajectory - FermionField PhiOdd; // the pseudo fermion field for this trajectory - FermionField Noise; // spare noise field for bounds check + static RationalActionParams transcribe(const Params &in){ + RationalActionParams out; + out.inv_pow = 2; + out.lo = in.lo; + out.hi = in.hi; + out.MaxIter = in.MaxIter; + out.action_tolerance = out.md_tolerance = in.tolerance; + out.action_degree = out.md_degree = in.degree; + out.precision = in.precision; + out.BoundsCheckFreq = in.BoundsCheckFreq; + return out; + } public: - OneFlavourEvenOddRatioRationalPseudoFermionAction(FermionOperator &_NumOp, - FermionOperator &_DenOp, - Params & p - ) : - NumOp(_NumOp), - DenOp(_DenOp), - PhiOdd (_NumOp.FermionRedBlackGrid()), - PhiEven(_NumOp.FermionRedBlackGrid()), - Noise(_NumOp.FermionRedBlackGrid()), - param(p) - { - AlgRemez remez(param.lo,param.hi,param.precision); + FermionOperator &_DenOp, + const Params & p + ) : + GeneralEvenOddRatioRationalPseudoFermionAction(_NumOp, _DenOp, transcribe(p)){} - // MdagM^(+- 1/2) - std::cout< sig^2 = 0.5. - // - // So eta should be of width sig = 1/sqrt(2). 
- - RealD scale = std::sqrt(0.5); - - FermionField eta(NumOp.FermionGrid()); - FermionField etaOdd (NumOp.FermionRedBlackGrid()); - FermionField etaEven(NumOp.FermionRedBlackGrid()); - FermionField tmp(NumOp.FermionRedBlackGrid()); - - gaussian(pRNG,eta); eta=eta*scale; - - pickCheckerboard(Even,etaEven,eta); - pickCheckerboard(Odd,etaOdd,eta); - - Noise = etaOdd; - NumOp.ImportGauge(U); - DenOp.ImportGauge(U); - - - // MdagM^1/4 eta - SchurDifferentiableOperator MdagM(DenOp); - ConjugateGradientMultiShift msCG_M(param.MaxIter,PowerQuarter); - msCG_M(MdagM,etaOdd,tmp); - - // VdagV^-1/4 MdagM^1/4 eta - SchurDifferentiableOperator VdagV(NumOp); - ConjugateGradientMultiShift msCG_V(param.MaxIter,PowerNegQuarter); - msCG_V(VdagV,tmp,PhiOdd); - - assert(NumOp.ConstEE() == 1); - assert(DenOp.ConstEE() == 1); - PhiEven = Zero(); - - }; - - ////////////////////////////////////////////////////// - // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi - ////////////////////////////////////////////////////// - virtual RealD S(const GaugeField &U) { - - NumOp.ImportGauge(U); - DenOp.ImportGauge(U); - - FermionField X(NumOp.FermionRedBlackGrid()); - FermionField Y(NumOp.FermionRedBlackGrid()); - - // VdagV^1/4 Phi - SchurDifferentiableOperator VdagV(NumOp); - ConjugateGradientMultiShift msCG_V(param.MaxIter,PowerQuarter); - msCG_V(VdagV,PhiOdd,X); - - // MdagM^-1/4 VdagV^1/4 Phi - SchurDifferentiableOperator MdagM(DenOp); - ConjugateGradientMultiShift msCG_M(param.MaxIter,PowerNegQuarter); - msCG_M(MdagM,X,Y); - - // Randomly apply rational bounds checks. 
- auto grid = NumOp.FermionGrid(); - auto r=rand(); - grid->Broadcast(0,r); - if ( (r%param.BoundsCheckFreq)==0 ) { - FermionField gauss(NumOp.FermionRedBlackGrid()); - gauss = Noise; - HighBoundCheck(MdagM,gauss,param.hi); - InverseSqrtBoundsCheck(param.MaxIter,param.tolerance*100,MdagM,gauss,PowerNegHalf); - ChebyBoundsCheck(MdagM,Noise,param.lo,param.hi); - } - - // Phidag VdagV^1/4 MdagM^-1/4 MdagM^-1/4 VdagV^1/4 Phi - RealD action = norm2(Y); - - return action; - }; - - // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi - // - // Here, M is some 5D operator and V is the Pauli-Villars field - // N and D makeup the rat. poly of the M term and P and & makeup the rat.poly of the denom term - // - // Need - // dS_f/dU = chi^dag d[P/Q] N/D P/Q chi - // + chi^dag P/Q d[N/D] P/Q chi - // + chi^dag P/Q N/D d[P/Q] chi - // - // P/Q is expressed as partial fraction expansion: - // - // a0 + \sum_k ak/(V^dagV + bk) - // - // d[P/Q] is then - // - // \sum_k -ak [V^dagV+bk]^{-1} [ dV^dag V + V^dag dV ] [V^dag V + bk]^{-1} - // - // and similar for N/D. 
- // - // Need - // MpvPhi_k = [Vdag V + bk]^{-1} chi - // MpvPhi = {a0 + \sum_k ak [Vdag V + bk]^{-1} }chi - // - // MfMpvPhi_k = [MdagM+bk]^{-1} MpvPhi - // MfMpvPhi = {a0 + \sum_k ak [Mdag M + bk]^{-1} } MpvPhi - // - // MpvMfMpvPhi_k = [Vdag V + bk]^{-1} MfMpvchi - // - - virtual void deriv(const GaugeField &U,GaugeField & dSdU) { - - const int n_f = PowerNegHalf.poles.size(); - const int n_pv = PowerQuarter.poles.size(); - - std::vector MpvPhi_k (n_pv,NumOp.FermionRedBlackGrid()); - std::vector MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid()); - std::vector MfMpvPhi_k (n_f ,NumOp.FermionRedBlackGrid()); - - FermionField MpvPhi(NumOp.FermionRedBlackGrid()); - FermionField MfMpvPhi(NumOp.FermionRedBlackGrid()); - FermionField MpvMfMpvPhi(NumOp.FermionRedBlackGrid()); - FermionField Y(NumOp.FermionRedBlackGrid()); - - GaugeField tmp(NumOp.GaugeGrid()); - - NumOp.ImportGauge(U); - DenOp.ImportGauge(U); - - SchurDifferentiableOperator VdagV(NumOp); - SchurDifferentiableOperator MdagM(DenOp); - - ConjugateGradientMultiShift msCG_V(param.MaxIter,PowerQuarter); - ConjugateGradientMultiShift msCG_M(param.MaxIter,PowerNegHalf); - - msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi); - msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi); - msCG_V(VdagV,MfMpvPhi,MpvMfMpvPhi_k,MpvMfMpvPhi); - - RealD ak; - - dSdU = Zero(); - - // With these building blocks - // - // dS/dU = - // \sum_k -ak MfMpvPhi_k^dag [ dM^dag M + M^dag dM ] MfMpvPhi_k (1) - // + \sum_k -ak MpvMfMpvPhi_k^\dag [ dV^dag V + V^dag dV ] MpvPhi_k (2) - // -ak MpvPhi_k^dag [ dV^dag V + V^dag dV ] MpvMfMpvPhi_k (3) - - //(1) - for(int k=0;k #include #include +#include +#include #include #include diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index 6d06e090..56dd6840 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -88,15 +88,9 @@ NAMESPACE_BEGIN(Grid); } - virtual void 
refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { + const FermionField &getPhiOdd() const{ return PhiOdd; } - // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi} - // - // NumOp == V - // DenOp == M - // - // Take phi_o = Vpcdag^{-1} Mpcdag eta_o ; eta_o = Mpcdag^{-1} Vpcdag Phi - // + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(eta_o) = e^{- eta_o^dag eta_o} // // e^{x^2/2 sig^2} => sig^2 = 0.5. @@ -104,12 +98,22 @@ NAMESPACE_BEGIN(Grid); RealD scale = std::sqrt(0.5); FermionField eta (NumOp.FermionGrid()); + gaussian(pRNG,eta); eta = eta * scale; + + refresh(U,eta); + } + + void refresh(const GaugeField &U, const FermionField &eta) { + + // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi} + // + // NumOp == V + // DenOp == M + // FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); - gaussian(pRNG,eta); - pickCheckerboard(Even,etaEven,eta); pickCheckerboard(Odd,etaOdd,eta); @@ -128,10 +132,6 @@ NAMESPACE_BEGIN(Grid); // Even det factors DenOp.MooeeDag(etaEven,tmp); NumOp.MooeeInvDag(tmp,PhiEven); - - PhiOdd =PhiOdd*scale; - PhiEven=PhiEven*scale; - }; ////////////////////////////////////////////////////// diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index 098f8f22..727b3e24 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -151,12 +151,22 @@ public: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else if (Parameters.StartingType == "CheckpointStartReseed") { + // Same as CheckpointRestart but reseed the RNGs using the fixed integer seeding used for ColdStart and HotStart + // Useful for creating new evolution streams from an existing stream + + // WARNING: Unfortunately because the checkpointer doesn't presently allow us to 
separately restore the RNG and gauge fields we have to load + // an existing RNG checkpoint first; make sure one is available and named correctly + Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, + Resources.GetSerialRNG(), + Resources.GetParallelRNG()); + Resources.SeedFixedIntegers(); } else { // others std::cout << GridLogError << "Unrecognized StartingType\n"; std::cout << GridLogError - << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart, CheckpointStartReseed]\n"; exit(1); } } diff --git a/Grid/qcd/hmc/HMCModules.h b/Grid/qcd/hmc/HMCModules.h index 4c61a006..cf0edd26 100644 --- a/Grid/qcd/hmc/HMCModules.h +++ b/Grid/qcd/hmc/HMCModules.h @@ -80,7 +80,9 @@ public: std::cout << GridLogError << "Seeds not initialized" << std::endl; exit(1); } + std::cout << GridLogMessage << "Reseeding serial RNG with seed vector " << SerialSeeds << std::endl; sRNG_.SeedFixedIntegers(SerialSeeds); + std::cout << GridLogMessage << "Reseeding parallel RNG with seed vector " << ParallelSeeds << std::endl; pRNG_->SeedFixedIntegers(ParallelSeeds); } }; diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 070cbea1..9563698c 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -334,15 +334,19 @@ public: void refresh(Field& U, GridSerialRNG & sRNG, GridParallelRNG& pRNG) { assert(P.Grid() == U.Grid()); - std::cout << GridLogIntegrator << "Integrator refresh\n"; + std::cout << GridLogIntegrator << "Integrator refresh" << std::endl; + std::cout << GridLogIntegrator << "Generating momentum" << std::endl; FieldImplementation::generate_momenta(P, sRNG, pRNG); // Update the smeared fields, can be implemented as observer // necessary to keep the fields updated even after a reject // of the Metropolis + std::cout << GridLogIntegrator << "Updating smeared fields" << std::endl; Smearer.set_Field(U); // Set 
the (eventual) representations gauge fields + + std::cout << GridLogIntegrator << "Updating representations" << std::endl; Representations.update(U); // The Smearer is attached to a pointer of the gauge field diff --git a/tests/forces/Test_gpdwf_force_1f_2f.cc b/tests/forces/Test_gpdwf_force_1f_2f.cc new file mode 100644 index 00000000..7e14eb08 --- /dev/null +++ b/tests/forces/Test_gpdwf_force_1f_2f.cc @@ -0,0 +1,446 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./forces/Test_gpdwf_force_1f_2f.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches + + +void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){ + GridBase* UGrid_2f = Umu_2f.Grid(); + GridBase* UGrid_1f = Umu_1f.Grid(); + + Replicate(Umu_2f,Umu_1f); + + int L_2f = UGrid_2f->FullDimensions()[nu]; + int L_1f = UGrid_1f->FullDimensions()[nu]; + assert(L_1f == 2 * L_2f); + + //Coordinate grid for reference + LatticeInteger xcoor_1f(UGrid_1f); + LatticeCoordinate(xcoor_1f,nu); + + //Copy-conjugate the gauge field + //First C-shift the lattice by Lx/2 + { + LatticeGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L_2f) ); + Umu_1f = where( xcoor_1f >= Integer(L_2f), Umu_shift, Umu_1f ); + + //We use the in built APBC + //Make the gauge field antiperiodic in nu-direction + //decltype(PeekIndex(Umu_1f,nu)) Unu(UGrid_1f); + //Unu = PeekIndex(Umu_1f,nu); + //Unu = where(xcoor_1f == Integer(2*L_2f-1), -Unu, Unu); + //PokeIndex(Umu_1f,Unu,nu); + } +} + +template +void convertFermion1f_from_2f(FermionField1f &out_1f, const FermionField2f &in_2f, const int nu, bool is_4d){ + GridBase* FGrid_1f = out_1f.Grid(); + GridBase* FGrid_2f = in_2f.Grid(); + + int nuoff = is_4d ? 
0 : 1; //s in 0 direction + + int L_2f = FGrid_2f->FullDimensions()[nu+nuoff]; + int L_1f = FGrid_1f->FullDimensions()[nu+nuoff]; + assert(L_1f == 2 * L_2f); + + auto in_f0_2fgrid = PeekIndex(in_2f,0); //flavor 0 on 2f Grid + FermionField1f in_f0_1fgrid(FGrid_1f); + Replicate(in_f0_2fgrid, in_f0_1fgrid); //has flavor 0 on both halves + + auto in_f1_2fgrid = PeekIndex(in_2f,1); //flavor 1 on 2f Grid + FermionField1f in_f1_1fgrid(FGrid_1f); + Replicate(in_f1_2fgrid, in_f1_1fgrid); //has flavor 1 on both halves + + LatticeInteger xcoor_1f(FGrid_1f); + LatticeCoordinate(xcoor_1f,nu+nuoff); + + out_1f = where(xcoor_1f < L_2f, in_f0_1fgrid, in_f1_1fgrid); +} + +template +class RatioActionSetupBase{ +protected: + TwoFlavourEvenOddRatioPseudoFermionAction *pf_1f; + TwoFlavourEvenOddRatioPseudoFermionAction *pf_2f; + + GparityAction* action_2f; + GparityAction* action_PV_2f; + StandardAction* action_1f; + StandardAction* action_PV_1f; + + ConjugateGradient CG_1f; + ConjugateGradient CG_2f; + + RatioActionSetupBase(): CG_1f(1.0e-8,10000), CG_2f(1.0e-8,10000){} + + void setupPseudofermion(){ + pf_1f = new TwoFlavourEvenOddRatioPseudoFermionAction(*action_PV_1f, *action_1f, CG_1f, CG_1f); + pf_2f = new TwoFlavourEvenOddRatioPseudoFermionAction(*action_PV_2f, *action_2f, CG_2f, CG_2f); + } + +public: + GparityAction & action2f(){ return *action_2f; } + StandardAction & action1f(){ return *action_1f; } + + void refreshAction(LatticeGaugeField &Umu_2f, typename GparityAction::FermionField &eta_2f, + LatticeGaugeField &Umu_1f, typename StandardAction::FermionField &eta_1f){ + pf_1f->refresh(Umu_1f, eta_1f); + pf_2f->refresh(Umu_2f, eta_2f); + + //Compare PhiOdd + RealD norm_1f = norm2(pf_1f->getPhiOdd()); + RealD norm_2f = norm2(pf_2f->getPhiOdd()); + + std::cout << "Test PhiOdd 2f: " << norm_2f << " 1f: " << norm_1f << std::endl; + } + + void computeAction(RealD &S_2f, RealD &S_1f, LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f){ + S_1f = pf_1f->S(Umu_1f); + S_2f = 
pf_2f->S(Umu_2f); + } + + void computeDeriv(LatticeGaugeField &deriv_2f, LatticeGaugeField &deriv_1f, LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f){ + pf_1f->deriv(Umu_1f, deriv_1f); + pf_2f->deriv(Umu_2f, deriv_2f); + } + +}; + + + + +template +struct setupAction{}; + +template<> +struct setupAction: public RatioActionSetupBase{ + typedef GparityWilsonTMFermionD GparityAction; + typedef WilsonTMFermionD StandardAction; + + setupAction(GridCartesian* UGrid_2f, GridRedBlackCartesian* UrbGrid_2f, GridCartesian* FGrid_2f, GridRedBlackCartesian* FrbGrid_2f, + GridCartesian* UGrid_1f, GridRedBlackCartesian* UrbGrid_1f, GridCartesian* FGrid_1f, GridRedBlackCartesian* FrbGrid_1f, + LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f, int nu): RatioActionSetupBase(){ + RealD mass=-1.8; + //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf + RealD epsilon_f = 0.02; //numerator (in determinant) + RealD epsilon_b = 0.5; + + std::vector twists(Nd,0); + twists[nu] = 1; //GPBC in y + twists[3] = 1; //APBC + GparityAction::ImplParams params_2f; params_2f.twists = twists; + action_2f = new GparityWilsonTMFermionD(Umu_2f,*UGrid_2f,*UrbGrid_2f, mass, epsilon_f, params_2f); + action_PV_2f = new GparityWilsonTMFermionD(Umu_2f,*UGrid_2f,*UrbGrid_2f, mass, epsilon_b, params_2f); + + DomainWallFermionD::ImplParams params_1f; + params_1f.boundary_phases[nu] = -1; + params_1f.boundary_phases[3] = -1; + + action_1f = new WilsonTMFermionD(Umu_1f,*UGrid_1f,*UrbGrid_1f, mass, epsilon_f, params_1f); + action_PV_1f = new WilsonTMFermionD(Umu_1f,*UGrid_1f,*UrbGrid_1f, mass, epsilon_b, params_1f); + + setupPseudofermion(); + } + + static bool is4d(){ return true; } +}; + + +template<> +struct setupAction: public RatioActionSetupBase{ + typedef GparityDomainWallFermionD GparityAction; + typedef DomainWallFermionD StandardAction; + + setupAction(GridCartesian* UGrid_2f, GridRedBlackCartesian* UrbGrid_2f, GridCartesian* FGrid_2f, GridRedBlackCartesian* FrbGrid_2f, + GridCartesian* 
UGrid_1f, GridRedBlackCartesian* UrbGrid_1f, GridCartesian* FGrid_1f, GridRedBlackCartesian* FrbGrid_1f, + LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f, int nu): RatioActionSetupBase(){ + RealD mass=0.01; + RealD M5=1.8; + + std::vector twists(Nd,0); + twists[nu] = 1; //GPBC in y + twists[3] = 1; //APBC + GparityDomainWallFermionD::ImplParams params_2f; params_2f.twists = twists; + action_2f = new GparityDomainWallFermionD(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5,params_2f); + action_PV_2f = new GparityDomainWallFermionD(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,1.0,M5,params_2f); + + DomainWallFermionD::ImplParams params_1f; + params_1f.boundary_phases[nu] = -1; + params_1f.boundary_phases[3] = -1; + + action_1f = new DomainWallFermionD(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5,params_1f); + action_PV_1f = new DomainWallFermionD(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,1.0,M5,params_1f); + + setupPseudofermion(); + } + + static bool is4d(){ return false; } +}; + + + + + +//For EOFA we need a different pseudofermion type +template<> +struct setupAction{ + typedef GparityDomainWallEOFAFermionD GparityAction; + typedef DomainWallEOFAFermionD StandardAction; + + ExactOneFlavourRatioPseudoFermionAction *pf_1f; + ExactOneFlavourRatioPseudoFermionAction *pf_2f; + + GparityAction* action_2f; + GparityAction* action_PV_2f; + StandardAction* action_1f; + StandardAction* action_PV_1f; + + ConjugateGradient CG_1f; + ConjugateGradient CG_2f; + +public: + GparityAction & action2f(){ return *action_2f; } + StandardAction & action1f(){ return *action_1f; } + + void refreshAction(LatticeGaugeField &Umu_2f, typename GparityAction::FermionField &eta_2f, + LatticeGaugeField &Umu_1f, typename StandardAction::FermionField &eta_1f){ + pf_1f->refresh(Umu_1f, eta_1f); + pf_2f->refresh(Umu_2f, eta_2f); + + //Compare PhiOdd + RealD norm_1f = norm2(pf_1f->getPhi()); + RealD norm_2f = norm2(pf_2f->getPhi()); + + std::cout << "Test 
Phi 2f: " << norm_2f << " 1f: " << norm_1f << std::endl; + } + + void computeAction(RealD &S_2f, RealD &S_1f, LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f){ + S_1f = pf_1f->S(Umu_1f); + S_2f = pf_2f->S(Umu_2f); + } + + void computeDeriv(LatticeGaugeField &deriv_2f, LatticeGaugeField &deriv_1f, LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f){ + pf_1f->deriv(Umu_1f, deriv_1f); + pf_2f->deriv(Umu_2f, deriv_2f); + } + + + setupAction(GridCartesian* UGrid_2f, GridRedBlackCartesian* UrbGrid_2f, GridCartesian* FGrid_2f, GridRedBlackCartesian* FrbGrid_2f, + GridCartesian* UGrid_1f, GridRedBlackCartesian* UrbGrid_1f, GridCartesian* FGrid_1f, GridRedBlackCartesian* FrbGrid_1f, + LatticeGaugeField &Umu_2f, LatticeGaugeField &Umu_1f, int nu): CG_1f(1.0e-8,10000), CG_2f(1.0e-8,10000){ + RealD mass=0.01; + RealD M5=1.8; + + std::vector twists(Nd,0); + twists[nu] = 1; //GPBC in y + twists[3] = 1; //APBC + GparityAction::ImplParams params_2f; params_2f.twists = twists; + action_2f = new GparityAction(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f, mass, mass, 1.0, 0.0, -1, M5, params_2f); + action_PV_2f = new GparityAction(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f, 1.0, mass, 1.0, -1.0, 1, M5, params_2f); //cf Test_dwf_gpforce_eofa.cc + + StandardAction::ImplParams params_1f; + params_1f.boundary_phases[nu] = -1; + params_1f.boundary_phases[3] = -1; + + action_1f = new StandardAction(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f, mass, mass, 1.0, 0.0, -1, M5, params_1f); + action_PV_1f = new StandardAction(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f, 1.0, mass, 1.0, -1.0, 1, M5, params_1f); + + OneFlavourRationalParams RationalParams(0.95, 100.0, 5000, 1.0e-12, 12); + + pf_1f = new ExactOneFlavourRatioPseudoFermionAction(*action_1f, *action_PV_1f, CG_1f, CG_1f, CG_1f, CG_1f, CG_1f, RationalParams, true); + pf_2f = new ExactOneFlavourRatioPseudoFermionAction(*action_2f, *action_PV_2f, CG_2f, CG_2f, CG_2f, CG_2f, CG_2f, RationalParams, true); + 
} + + static bool is4d(){ return false; } +}; + + +template +void runTest(int argc, char** argv){ + Grid_init(&argc,&argv); + + const int nu = 1; + Coordinate latt_2f = GridDefaultLatt(); + Coordinate latt_1f = latt_2f; + latt_1f[nu] *= 2; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + const int Ls=8; + + GridCartesian * UGrid_1f = SpaceTimeGrid::makeFourDimGrid(latt_1f, simd_layout, mpi_layout); + GridRedBlackCartesian * UrbGrid_1f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_1f); + GridCartesian * FGrid_1f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_1f); + GridRedBlackCartesian * FrbGrid_1f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_1f); + + + GridCartesian * UGrid_2f = SpaceTimeGrid::makeFourDimGrid(latt_2f, simd_layout, mpi_layout); + GridRedBlackCartesian * UrbGrid_2f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_2f); + GridCartesian * FGrid_2f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_2f); + GridRedBlackCartesian * FrbGrid_2f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_2f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5_2f(FGrid_2f); RNG5_2f.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu_2f(UGrid_2f); + SU::HotConfiguration(RNG4_2f,Umu_2f); + + LatticeGaugeField Umu_1f(UGrid_1f); + copyConjGauge(Umu_1f, Umu_2f, nu); + + typedef typename GparityAction::FermionField GparityFermionField; + typedef typename StandardAction::FermionField StandardFermionField; + + setupAction setup(UGrid_2f, UrbGrid_2f, FGrid_2f, FrbGrid_2f, + UGrid_1f, UrbGrid_1f, FGrid_1f, FrbGrid_1f, + Umu_2f, Umu_1f, nu); + GridBase* FGrid_2f_a = setup.action2f().FermionGrid(); + GridBase* FGrid_1f_a = setup.action1f().FermionGrid(); + GridBase* FrbGrid_2f_a = setup.action2f().FermionRedBlackGrid(); + GridBase* FrbGrid_1f_a = setup.action1f().FermionRedBlackGrid(); + bool is_4d = setup.is4d(); 
+ + //Check components by doing an inversion + { + setup.action2f().ImportGauge(Umu_2f); + setup.action1f().ImportGauge(Umu_1f); + + GparityFermionField src_2f(FGrid_2f_a); + gaussian(is_4d ? RNG4_2f : RNG5_2f, src_2f); + + StandardFermionField src_1f(FGrid_1f_a); + convertFermion1f_from_2f(src_1f, src_2f, nu, is_4d); + + StandardFermionField src_o_1f(FrbGrid_1f_a); + StandardFermionField result_o_1f(FrbGrid_1f_a); + pickCheckerboard(Odd,src_o_1f,src_1f); + result_o_1f=Zero(); + + SchurDiagMooeeOperator HermOpEO_1f(setup.action1f()); + ConjugateGradient CG_1f(1.0e-8,10000); + CG_1f(HermOpEO_1f,src_o_1f,result_o_1f); + + + GparityFermionField src_o_2f(FrbGrid_2f_a); + GparityFermionField result_o_2f(FrbGrid_2f_a); + pickCheckerboard(Odd,src_o_2f,src_2f); + result_o_2f=Zero(); + + SchurDiagMooeeOperator HermOpEO_2f(setup.action2f()); + ConjugateGradient CG_2f(1.0e-8,10000); + CG_2f(HermOpEO_2f,src_o_2f,result_o_2f); + + RealD norm_1f = norm2(result_o_1f); + RealD norm_2f = norm2(result_o_2f); + + std::cout << "Test fermion inversion 2f: " << norm_2f << " 1f: " << norm_1f << std::endl; + } + + //Generate eta + RealD scale = std::sqrt(0.5); + + GparityFermionField eta_2f(FGrid_2f_a); + gaussian(is_4d ? 
RNG4_2f : RNG5_2f,eta_2f); eta_2f = eta_2f * scale; + + StandardFermionField eta_1f(FGrid_1f_a); + convertFermion1f_from_2f(eta_1f, eta_2f, nu, is_4d); + + setup.refreshAction(Umu_2f, eta_2f, Umu_1f, eta_1f); + + //Initial action is just |eta^2| + RealD S_1f, S_2f; + + setup.computeAction(S_2f, S_1f, Umu_2f, Umu_1f); + + std::cout << "Test Initial action 2f: " << S_2f << " 1f: " << S_1f << " diff: " << S_2f - S_1f << std::endl; + + //Do a random gauge field refresh + SU::HotConfiguration(RNG4_2f,Umu_2f); + copyConjGauge(Umu_1f, Umu_2f, nu); + + //Compute the action again + setup.computeAction(S_2f, S_1f, Umu_2f, Umu_1f); + + std::cout << "Test Action after gauge field randomize 2f: " << S_2f << " 1f: " << S_1f << " diff: " << S_2f - S_1f << std::endl; + + //Compute the derivative and test the conjugate relation + LatticeGaugeField deriv_2f(UGrid_2f); + LatticeGaugeField deriv_1f(UGrid_1f); + setup.computeDeriv(deriv_2f, deriv_1f, Umu_2f, Umu_1f); + + //Have to combine the two forces on the 1f by symmetrizing under the complex conjugate + { + RealD norm2_pre = norm2(deriv_1f); + LatticeGaugeField deriv_1f_shift = conjugate( Cshift(deriv_1f, nu, latt_2f[nu]) ); + deriv_1f = deriv_1f + deriv_1f_shift; + std::cout << "Test combine/symmetrize forces on 1f lattice, dS/dU : " << norm2_pre << " -> " << norm2(deriv_1f) << std::endl; + } + + LatticeGaugeField deriv_1f_from_2f(UGrid_1f); + copyConjGauge(deriv_1f_from_2f, deriv_2f, nu); + std::cout << "Test copy-conj 2f dS/dU to obtain equivalent 1f force : " << norm2(deriv_2f) << " -> " << norm2(deriv_1f_from_2f) << std::endl; + + LatticeGaugeField diff_deriv_1f = deriv_1f - deriv_1f_from_2f; + + std::cout << "Test dS/dU 1f constructed from 2f derivative: " << norm2(deriv_1f_from_2f) << " dS/dU 1f actual: " << norm2(deriv_1f) << " Norm of difference: " << norm2(diff_deriv_1f) << std::endl; + + std::cout<< GridLogMessage << "Done" <(argc, argv); + }else if(action == "EOFA"){ + runTest(argc, argv); + }else if(action == "DSDR"){ 
+ runTest(argc,argv); + }else{ + assert(0); + } +} diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index eea3e3f4..1d25771a 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -89,7 +89,49 @@ int main (int argc, char** argv) ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + + //Check the rational approximation + { + RealD scale = std::sqrt(0.5); + LatticeFermion eta (Lop.FermionGrid()); + gaussian(RNG5,eta); eta = eta * scale; + + Meofa.refresh(U, eta); + + //Phi = M^{-1/2} eta + //M is Hermitian + //(Phi, M Phi) = eta^\dagger M^{-1/2} M M^{-1/2} eta = eta^\dagger eta + LatticeFermion phi = Meofa.getPhi(); + LatticeFermion Mphi(FGrid); + + Meofa.Meofa(U, phi, Mphi); + std::cout << "Computing inner product" << std::endl; + ComplexD inner = innerProduct(phi, Mphi); + ComplexD test = inner - norm2(eta); + + std::cout << "(phi, Mphi) - (eta,eta): " << test << " expect 0" << std::endl; + + assert(test.real() < 1e-8); + assert(test.imag() < 1e-8); + + //Another test is to use heatbath twice to apply M^{-1/2} to Phi then apply M + // M Phi' + //= M M^{-1/2} Phi + //= M M^{-1/2} M^{-1/2} eta + //= eta + Meofa.refresh(U, phi); + LatticeFermion phi2 = Meofa.getPhi(); + LatticeFermion test2(FGrid); + Meofa.Meofa(U, phi2, test2); + test2 = test2 - eta; + RealD test2_norm = norm2(test2); + std::cout << "|M M^{-1/2} M^{-1/2} eta - eta|^2 = " << test2_norm << " expect 0" << std::endl; + assert( test2_norm < 1e-8 ); + } + + Meofa.refresh(U, sRNG, RNG5 ); + RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" diff --git a/tests/forces/Test_mobius_gparity_eofa_mixed.cc b/tests/forces/Test_mobius_gparity_eofa_mixed.cc new file mode 100644 index 00000000..d490e838 --- /dev/null +++ b/tests/forces/Test_mobius_gparity_eofa_mixed.cc @@ -0,0 +1,233 @@ 
+/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/forces/Test_mobius_gparity_eofa_mixed.cc + +Copyright (C) 2017 + +Author: Christopher Kelly +Author: Peter Boyle +Author: David Murphy + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace std; +using namespace Grid; + ; + +typedef GparityWilsonImplD FermionImplPolicyD; +typedef GparityMobiusEOFAFermionD FermionActionD; +typedef typename FermionActionD::FermionField FermionFieldD; + +typedef GparityWilsonImplF FermionImplPolicyF; +typedef GparityMobiusEOFAFermionF FermionActionF; +typedef typename FermionActionF::FermionField FermionFieldF; + +NAMESPACE_BEGIN(Grid); + + template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + }; + + void operator()(LinearOperatorBase &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); + + precisionChange(FermOpF.Umu, FermOpD.Umu); + + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + MixedPrecisionConjugateGradient MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); + MPCG.InnerTolerance = InnerTolerance; + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < seeds4({1,2,3,5}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4); + + int threads = GridThread::GetThreads(); + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + LatticeGaugeFieldD Ud(UGridD); + SU::HotConfiguration(RNG4,Ud); + + LatticeGaugeFieldF Uf(UGridF); + precisionChange(Uf, Ud); + + RealD b = 2.5; + RealD c = 1.5; + RealD mf = 0.01; + RealD mb = 1.0; + RealD M5 = 1.8; + FermionActionD::ImplParams params; + params.twists[0] = 1; //GPBC in X + params.twists[Nd-1] = 1; //APRD in T + + std::vector gtwists(4,0); + gtwists[0] = 1; + + ConjugateGimplD::setDirections(gtwists); + + FermionActionD LopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, mf, mf, mb, 0.0, -1, M5, b, c, params); + FermionActionD RopD(Ud, *FGridD, *FrbGridD, *UGridD, *UrbGridD, mb, mf, mb, -1.0, 1, M5, b, c, params); + + FermionActionF LopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, mf, mf, mb, 0.0, -1, M5, b, c, params); + FermionActionF RopF(Uf, *FGridF, *FrbGridF, *UGridF, *UrbGridF, mb, mf, mb, -1.0, 1, 
M5, b, c, params); + + + OneFlavourRationalParams OFRp(0.95, 100.0, 5000, 1.0e-12, 12); + ConjugateGradient CG(1.0e-10, 10000); + + + typedef SchurDiagMooeeOperator EOFAschuropD; + typedef SchurDiagMooeeOperator EOFAschuropF; + + EOFAschuropD linopL_D(LopD); + EOFAschuropD linopR_D(RopD); + + EOFAschuropF linopL_F(LopF); + EOFAschuropF linopR_F(RopF); + + typedef MixedPrecisionConjugateGradientOperatorFunction EOFA_mxCG; + + EOFA_mxCG MCG_L(1e-10, 10000, 1000, UGridF, FrbGridF, LopF, LopD, linopL_F, linopL_D); + MCG_L.InnerTolerance = 1e-5; + + EOFA_mxCG MCG_R(1e-10, 10000, 1000, UGridF, FrbGridF, RopF, RopD, linopR_F, linopR_D); + MCG_R.InnerTolerance = 1e-5; + + ExactOneFlavourRatioPseudoFermionAction MeofaD(LopD, RopD, CG, CG, CG, CG, CG, OFRp, true); + ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction MeofaMx(LopF, RopF, LopD, RopD, MCG_L, MCG_R, MCG_L, MCG_R, MCG_L, MCG_R, OFRp, true); + + FermionFieldD eta(FGridD); + gaussian(RNG5, eta); + + MeofaD.refresh(Ud, eta); + MeofaMx.refresh(Ud, eta); + + FermionFieldD diff_phi(FGridD); + diff_phi = MeofaD.getPhi() - MeofaMx.getPhi(); + + RealD n = norm2(diff_phi); + + std::cout << GridLogMessage << "Phi(double)=" << norm2(MeofaD.getPhi()) << " Phi(mixed)=" << norm2(MeofaMx.getPhi()) << " diff=" << n << std::endl; + + assert(n < 1e-8); + + RealD Sd = MeofaD.S(Ud); + RealD Smx = MeofaMx.S(Ud); + + std::cout << GridLogMessage << "Initial action double=" << Sd << " mixed=" << Smx << " diff=" << Sd-Smx << std::endl; + + assert(fabs(Sd-Smx) < 1e-6); + + SU::HotConfiguration(RNG4,Ud); + precisionChange(Uf, Ud); + + Sd = MeofaD.S(Ud); + Smx = MeofaMx.S(Ud); + + std::cout << GridLogMessage << "After randomizing U, action double=" << Sd << " mixed=" << Smx << " diff=" << Sd-Smx << std::endl; + + assert(fabs(Sd-Smx) < 1e-6); + + std::cout << GridLogMessage << "Done" << std::endl; + Grid_finalize(); +} diff --git a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc new file mode 
100644 index 00000000..830bcead --- /dev/null +++ b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc @@ -0,0 +1,257 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: tests/hmc/Test_action_dwf_gparity2fvs1f.cc + + Copyright (C) 2015 + + Author: Christopher Kelly + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + + + +template +void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){ + auto f0_halfgrid = PeekIndex(in,0); //on 2f Grid + FermionField1f f0_fullgrid_dbl(out.Grid()); + Replicate(f0_halfgrid, f0_fullgrid_dbl); //double it up to live on the 1f Grid + + auto f1_halfgrid = PeekIndex(in,1); + FermionField1f f1_fullgrid_dbl(out.Grid()); + Replicate(f1_halfgrid, f1_fullgrid_dbl); + + const Coordinate &dim_2f = in.Grid()->GlobalDimensions(); + const Coordinate &dim_1f = out.Grid()->GlobalDimensions(); + + //We have to be careful for 5d fields; the s-direction is placed before the x,y,z,t and so we need to shift gpdir by 1 + std::cout << "gpdir " << gpdir << std::endl; + + gpdir+=1; + std::cout << "gpdir for 5D fields " << gpdir << std::endl; + + std::cout << "dim_2f " << dim_2f << std::endl; + std::cout << "dim_1f " << dim_1f << std::endl; + + assert(dim_1f[gpdir] == 2*dim_2f[gpdir]); + + LatticeInteger xcoor_1f(out.Grid()); //5d lattice integer + LatticeCoordinate(xcoor_1f,gpdir); + + int L = dim_2f[gpdir]; + + out = where(xcoor_1f < L, f0_fullgrid_dbl, f1_fullgrid_dbl); +} + +//Both have the same field type +void copy2fTo1fGaugeField(LatticeGaugeField &out, const LatticeGaugeField &in, int gpdir){ + LatticeGaugeField U_dbl(out.Grid()); + Replicate(in, U_dbl); + + LatticeGaugeField Uconj_dbl = conjugate( U_dbl ); + + const Coordinate &dim_2f = in.Grid()->GlobalDimensions(); + + LatticeInteger xcoor_1f(out.Grid()); + LatticeCoordinate(xcoor_1f,gpdir); + + int L = dim_2f[gpdir]; + + out = where(xcoor_1f < L, U_dbl, Uconj_dbl); +} + + +std::ostream & operator<<(std::ostream &os, const Coordinate &x){ + os << "("; + for(int i=0;i seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5_2f(FGrid_2f); 
RNG5_2f.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4_2f(UGrid_2f); RNG4_2f.SeedFixedIntegers(seeds4); + + std::cout << "Generating hot 2f gauge configuration" << std::endl; + LatticeGaugeField Umu_2f(UGrid_2f); + SU::HotConfiguration(RNG4_2f,Umu_2f); + + std::cout << "Copying 2f->1f gauge field" << std::endl; + LatticeGaugeField Umu_1f(UGrid_1f); + copy2fTo1fGaugeField(Umu_1f, Umu_2f, mu); + + typedef GparityWilsonImplR FermionImplPolicy2f; + typedef GparityDomainWallFermionR FermionAction2f; + typedef typename FermionAction2f::FermionField FermionField2f; + + typedef WilsonImplR FermionImplPolicy1f; + typedef DomainWallFermionR FermionAction1f; + typedef typename FermionAction1f::FermionField FermionField1f; + + std::cout << "Generating eta 2f" << std::endl; + FermionField2f eta_2f(FGrid_2f); + gaussian(RNG5_2f, eta_2f); + + RealD scale = std::sqrt(0.5); + eta_2f=eta_2f*scale; + + std::cout << "Copying 2f->1f eta" << std::endl; + FermionField1f eta_1f(FGrid_1f); + copy2fTo1fFermionField(eta_1f, eta_2f, mu); + + Real beta = 2.13; + Real light_mass = 0.01; + Real strange_mass = 0.032; + Real pv_mass = 1.0; + RealD M5 = 1.8; + + //Setup the Dirac operators + std::cout << "Initializing Dirac operators" << std::endl; + + FermionAction2f::ImplParams Params_2f; + Params_2f.twists[mu] = 1; + Params_2f.twists[Nd-1] = 1; //APBC in time direction + + //note 'Num' and 'Den' here refer to the determinant ratio, not the operator ratio in the pseudofermion action where the two are inverted + //to my mind the Pauli Villars and 'denominator' are synonymous but the Grid convention has this as the 'Numerator' operator in the RHMC implementation + FermionAction2f NumOp_2f(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f, *UrbGrid_2f, light_mass,M5,Params_2f); + FermionAction2f DenOp_2f(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f, *UrbGrid_2f, pv_mass, M5,Params_2f); + + FermionAction1f::ImplParams Params_1f; + Params_1f.boundary_phases[mu] = -1; //antiperiodic in doubled lattice in GP 
direction + Params_1f.boundary_phases[Nd-1] = -1; + + FermionAction1f NumOp_1f(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f, *UrbGrid_1f, light_mass,M5,Params_1f); + FermionAction1f DenOp_1f(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f, *UrbGrid_1f, pv_mass, M5,Params_1f); + + //Test the replication routines by running a CG on eta + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG_2f(StoppingCondition,MaxCGIterations); + ConjugateGradient CG_1f(StoppingCondition,MaxCGIterations); + + NumOp_1f.ImportGauge(Umu_1f); + NumOp_2f.ImportGauge(Umu_2f); + + FermionField1f test_1f(FGrid_1f); + FermionField2f test_2f(FGrid_2f); + + MdagMLinearOperator Linop_1f(NumOp_1f); + MdagMLinearOperator Linop_2f(NumOp_2f); + + CG_1f(Linop_1f, eta_1f, test_1f); + CG_2f(Linop_2f, eta_2f, test_2f); + RealD test_1f_norm = norm2(test_1f); + RealD test_2f_norm = norm2(test_2f); + + std::cout << "Verification of replication routines: " << test_1f_norm << " " << test_2f_norm << " " << test_1f_norm - test_2f_norm << std::endl; + + +#if 1 + typedef GeneralEvenOddRatioRationalPseudoFermionAction Action2f; + typedef GeneralEvenOddRatioRationalPseudoFermionAction Action1f; + + RationalActionParams rational_params; + rational_params.inv_pow = 2; + rational_params.lo = 1e-5; + rational_params.hi = 32; + rational_params.md_degree = 16; + rational_params.action_degree = 16; + + Action2f action_2f(DenOp_2f, NumOp_2f, rational_params); + Action1f action_1f(DenOp_1f, NumOp_1f, rational_params); +#else + typedef TwoFlavourEvenOddRatioPseudoFermionAction Action2f; + typedef TwoFlavourEvenOddRatioPseudoFermionAction Action1f; + + Action2f action_2f(DenOp_2f, NumOp_2f, CG_2f, CG_2f); + Action1f action_1f(DenOp_1f, NumOp_1f, CG_1f, CG_1f); +#endif + + + std::cout << "Action refresh" << std::endl; + action_2f.refresh(Umu_2f, eta_2f); + action_1f.refresh(Umu_1f, eta_1f); + + std::cout << "Action compute post heatbath" << std::endl; + RealD S_2f = action_2f.S(Umu_2f); + RealD S_1f = 
action_1f.S(Umu_1f); + + std::cout << "Action comparison post heatbath" << std::endl; + std::cout << S_2f << " " << S_1f << " " << S_2f-S_1f << std::endl; + + //Change the gauge field between refresh and action eval else the matrix and inverse matrices all cancel and we just get |eta|^2 + SU::HotConfiguration(RNG4_2f,Umu_2f); + copy2fTo1fGaugeField(Umu_1f, Umu_2f, mu); + + //Now compute the action with the new gauge field + std::cout << "Action compute post gauge field update" << std::endl; + S_2f = action_2f.S(Umu_2f); + S_1f = action_1f.S(Umu_1f); + + std::cout << "Action comparison post gauge field update" << std::endl; + std::cout << S_2f << " " << S_1f << " " << S_2f-S_1f << std::endl; + + Grid_finalize(); +} // main + + diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc index 7f74d5d8..d4bfa0a5 100644 --- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc +++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc @@ -58,7 +58,7 @@ int main(int argc, char **argv) { CheckpointerParameters CPparams; CPparams.config_prefix = "ckpoint_EODWF_lat"; CPparams.rng_prefix = "ckpoint_EODWF_rng"; - CPparams.saveInterval = 5; + CPparams.saveInterval = 1; CPparams.format = "IEEE64BIG"; TheHMC.Resources.LoadNerscCheckpointer(CPparams); @@ -79,7 +79,7 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 2.6 ; - const int nu = 3; + const int nu = 1; std::vector twists(Nd,0); twists[nu] = 1; ConjugateGimplD::setDirections(twists); diff --git a/tests/solver/Test_eofa_inv.cc b/tests/solver/Test_eofa_inv.cc new file mode 100644 index 00000000..564405c2 --- /dev/null +++ b/tests/solver/Test_eofa_inv.cc @@ -0,0 +1,125 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/solver/Test_eofa_inv.cc + +Copyright (C) 2017 + +Author: Christopher Kelly +Author: Peter Boyle +Author: David Murphy + +This 
program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char** argv) +{ + Grid_init(&argc, &argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + const int Ls = 8; + + GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid); + GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid); + + // Want a different conf at every run + // First create an instance of an engine. + std::random_device rnd_device; + // Specify the engine and distribution. 
+ std::mt19937 mersenne_engine(rnd_device()); + std::uniform_int_distribution dist(1, 100); + + auto gen = std::bind(dist, mersenne_engine); + std::vector seeds4(4); + generate(begin(seeds4), end(seeds4), gen); + + //std::vector seeds4({1,2,3,5}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + int threads = GridThread::GetThreads(); + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + LatticeFermion phi (FGrid); gaussian(RNG5, phi); + LatticeFermion Mphi (FGrid); + LatticeFermion MphiPrime (FGrid); + + LatticeGaugeField U(UGrid); + SU::HotConfiguration(RNG4,U); + + //////////////////////////////////// + // Unmodified matrix element + //////////////////////////////////// + RealD b = 2.5; + RealD c = 1.5; + RealD mf = 0.01; + RealD mb = 1.0; + RealD M5 = 1.8; + MobiusEOFAFermionR Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5, b, c); + MobiusEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c); + OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-10, 12); + ConjugateGradient CG(1.0e-10, 5000); + ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); + + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + + + //Random field + LatticeFermion eta(FGrid); + gaussian(RNG5,eta); + + //Check left inverse + LatticeFermion Meta(FGrid); + Meofa.Meofa(U, eta, Meta); + + LatticeFermion MinvMeta(FGrid); + Meofa.MeofaInv(U, Meta, MinvMeta); + + LatticeFermion diff = MinvMeta - eta; + + std::cout << GridLogMessage << "eta: " << norm2(eta) << " M*eta: " << norm2(Meta) << " M^{-1}*M*eta: " << norm2(MinvMeta) << " M^{-1}*M*eta - eta: " << norm2(diff) << " (expect 0)" << std::endl; + assert(norm2(diff) < 1e-8); + + //Check right inverse + LatticeFermion MinvEta(FGrid); + Meofa.MeofaInv(U, eta, MinvEta); + + LatticeFermion 
MMinvEta(FGrid); + Meofa.Meofa(U, MinvEta, MMinvEta); + + diff = MMinvEta - eta; + + std::cout << GridLogMessage << "eta: " << norm2(eta) << " M^{-1}*eta: " << norm2(MinvEta) << " M*M^{-1}*eta: " << norm2(MMinvEta) << " M*M^{-1}*eta - eta: " << norm2(diff) << " (expect 0)" << std::endl; + assert(norm2(diff) < 1e-8); + + std::cout << GridLogMessage << "Done" << std::endl; + Grid_finalize(); +} From e1e7b1e2245a2db5686ae1dd8ff27bf1f0b37314 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jun 2022 12:09:52 -0400 Subject: [PATCH 052/240] RNG fix --- Grid/lattice/Lattice_rng.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 34df8da2..6857dc84 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -428,18 +428,20 @@ public: thread_for( lidx, _grid->lSites(), { int gidx; + int o_idx; + int i_idx; + int rank; Coordinate pcoor; Coordinate lcoor; Coordinate gcoor; _grid->LocalIndexToLocalCoor(lidx,lcoor); pcoor=_grid->ThisProcessorCoor(); _grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor); + _grid->GlobalCoorToGlobalIndex(gcoor,gidx); - int o_idx; - int i_idx; - int rank; _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); assert(rank == _grid->ThisRank() ); + int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; Skip(_generators[l_idx],gidx); // Skip to next RNG sequence From efd7338a0031c972cd3e01866c0e2c0f7f480ead Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jun 2022 12:10:27 -0400 Subject: [PATCH 053/240] Allow dirichlet at round the world link --- Grid/qcd/action/filters/DirichletFilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/filters/DirichletFilter.h b/Grid/qcd/action/filters/DirichletFilter.h index 4571c1de..e388891f 100644 --- a/Grid/qcd/action/filters/DirichletFilter.h +++ b/Grid/qcd/action/filters/DirichletFilter.h @@ -53,7 +53,7 @@ struct DirichletFilter: public 
MomentumFilterBase LatticeInteger coor(grid); LatCM zz(grid); zz = Zero(); for(int mu=0;muGlobalDimensions()[mu] ) ) { + if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) { // If costly could provide Grid earlier and precompute masks std::cout << GridLogMessage << " Dirichlet in mu="< Date: Mon, 27 Jun 2022 12:11:08 -0400 Subject: [PATCH 054/240] More verbose CG --- Grid/algorithms/iterative/ConjugateGradient.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index c20e267d..9a644c7e 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -175,13 +175,13 @@ public: << "\tTrue residual " << true_residual << "\tTarget " << Tolerance << std::endl; - std::cout << GridLogIterative << "Time breakdown "< Date: Mon, 27 Jun 2022 12:14:57 -0400 Subject: [PATCH 055/240] Spelling correction --- Grid/algorithms/iterative/ConjugateGradientMultiShift.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h index 3b079e99..0e70916f 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h @@ -44,7 +44,7 @@ public: using OperatorFunction::operator(); - RealD Tolerance; + // RealD Tolerance; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion std::vector IterationsToCompleteShift; // Iterations for this shift @@ -324,8 +324,8 @@ public: std::cout << GridLogMessage << "Time Breakdown "< Date: Mon, 27 Jun 2022 12:15:55 -0400 Subject: [PATCH 056/240] Allow frequency=0 to disable --- Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index 2c2402f8..b4417cac 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -185,7 +185,7 @@ NAMESPACE_BEGIN(Grid); auto grid = NumOp.FermionGrid(); auto r=rand(); grid->Broadcast(0,r); - if ( (r%param.BoundsCheckFreq)==0 ) { + if ( param.BoundsCheckFreq && ((r%param.BoundsCheckFreq)==0) ) { FermionField gauss(NumOp.FermionRedBlackGrid()); gauss = Noise; HighBoundCheck(MdagM,gauss,param.hi); From 4ac1094856cd52c3e3c2b817762ee63b17a720c6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Jun 2022 12:16:24 -0400 Subject: [PATCH 057/240] Updated config commands --- configure.ac | 11 +++++++---- systems/Crusher/config-command | 1 + systems/Summit/config-command | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/configure.ac b/configure.ac index c8023939..528f0125 100644 --- a/configure.ac +++ b/configure.ac @@ -556,16 +556,19 @@ esac AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice], [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no]) case ${ac_SETDEVICE} in - yes);; - no) + yes) + echo ENABLE SET DEVICE + ;; + *) AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] ) + echo DISABLE SET DEVICE ;; esac ######################################################### ###################### Shared memory intranode ######### 
######################################################### -AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no], +AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no|none], [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no]) case ${ac_SHM} in @@ -585,7 +588,7 @@ case ${ac_SHM} in AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) ;; - shmnone | no) + shmnone | no | none) AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] ) ;; diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index c93ea9c8..9e68a354 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -1,6 +1,7 @@ ../../configure --enable-comms=mpi-auto \ --enable-unified=no \ --enable-shm=nvlink \ +--enable-debug \ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ diff --git a/systems/Summit/config-command b/systems/Summit/config-command index 46e37af1..4caf652e 100644 --- a/systems/Summit/config-command +++ b/systems/Summit/config-command @@ -2,12 +2,12 @@ --enable-simd=GPU \ --enable-gen-simd-width=32 \ --enable-unified=no \ - --enable-shm=nvlink \ + --enable-shm=no \ --disable-gparity \ - --enable-setdevice \ + --disable-setdevice \ --disable-fermion-reps \ --enable-accelerator=cuda \ - --disable-accelerator-cshift \ + --enable-accelerator-cshift \ --prefix /ccs/home/paboyle/prefix \ CXX=nvcc \ LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \ From 87ad76d81be5c15fdc18016d60b96efa17c00f17 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Jun 2022 13:42:46 -0400 Subject: [PATCH 058/240] Initialise timeval --- Grid/perfmon/Timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 2a44faee..02c23a62 100644 --- a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -39,9 +39,9 @@ NAMESPACE_BEGIN(Grid) // C++11 time facilities better? 
inline double usecond(void) { struct timeval tv; -#ifdef TIMERS_ON + tv.tv_sec = 0; + tv.tv_usec = 0; gettimeofday(&tv,NULL); -#endif return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; } From 220050822a6e51dbbc921f9cb0d0bb02018e1151 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Jun 2022 13:43:27 -0400 Subject: [PATCH 059/240] Speed up M5D and M5Ddag --- .../implementation/CayleyFermion5Dcache.h | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index d2537ccf..1581bee4 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -66,18 +66,17 @@ CayleyFermion5D::M5D(const FermionField &psi_i, M5Dcalls++; M5Dtime-=usecond(); - uint64_t nloop = grid->oSites()/Ls; + uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ - uint64_t ss= sss*Ls; + uint64_t s = sss%Ls; + uint64_t ss= sss-s; typedef decltype(coalescedRead(psi[0])) spinor; spinor tmp1, tmp2; - for(int s=0;s::M5Ddag(const FermionField &psi_i, M5Dcalls++; M5Dtime-=usecond(); - uint64_t nloop = grid->oSites()/Ls; + uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ - uint64_t ss=sss*Ls; + uint64_t s = sss%Ls; + uint64_t ss= sss-s; typedef decltype(coalescedRead(psi[0])) spinor; spinor tmp1,tmp2; - for(int s=0;s Date: Thu, 30 Jun 2022 13:44:09 -0400 Subject: [PATCH 060/240] Rough flop counting, need to add M5D, M5Ddag, MooeeInv flops --- tests/Test_dwf_mixedcg_prec.cc | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index da0b54cd..da71f72d 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -46,7 +46,7 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); - const int 
Ls=8; + const int Ls=12; std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; @@ -94,13 +94,32 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); - mCG(src_o,result_o); - + double t1,t2,flops; + int iters; + for(int i=0;i<100;i++){ + result_o = Zero(); + t1=usecond(); + mCG(src_o,result_o); + t2=usecond(); + iters = mCG.TotalInnerIterations; //Number of inner CG iterations + flops = 1320.0*2*FGrid->gSites()*iters; + std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - CG(HermOpEO,src_o,result_o_2); - - MemoryManager::Print(); + for(int i=0;i<100;i++){ + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = 1320.0*2*FGrid->gSites()*iters; + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 30 Jun 2022 13:45:07 -0400 Subject: [PATCH 061/240] Dirichlet implementation --- .../implementation/WilsonFermion5DImplementation.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 681a6914..51c7df57 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -92,15 +92,19 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); } - if ( p.dirichlet.size() ) { + if ( p.dirichlet.size() == Nd+1) { Coordinate block = p.dirichlet; - assert(block.size()==Nd+1); if ( block[0] || block[1] || block[2] || block[3] || block[4] ){ Dirichlet = 1; Block = block; } 
+ } else { + Coordinate block(Nd+1,0); + Block = block; } - + + ZeroCounters(); + if (Impl::LsVectorised) { int nsimd = Simd::Nsimd(); From d03152fac47d7f2d1d0d5e3da317833e34210db3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Jun 2022 13:49:35 -0400 Subject: [PATCH 062/240] New file under debug --- HMC/Mobius2p1f_DD_RHMC_96I.cc | 419 ++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 HMC/Mobius2p1f_DD_RHMC_96I.cc diff --git a/HMC/Mobius2p1f_DD_RHMC_96I.cc b/HMC/Mobius2p1f_DD_RHMC_96I.cc new file mode 100644 index 00000000..a6a2f26c --- /dev/null +++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc @@ -0,0 +1,419 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionR FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 6; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + 
////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + // Real beta = 2.31; + // Real light_mass = 5.4e-4; + Real beta = 2.13; + Real light_mass = 7.8e-4; + Real strange_mass = 0.02132; + Real pv_mass = 1.0; + // std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + + // FIXME: + // Same in MC and MD + // Need to mix precision too + OneFlavourRationalParams SFRp; // Strange + SFRp.lo = 4.0e-3; + SFRp.hi = 90.0; + SFRp.MaxIter = 60000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 1.0e-4; + SFRp.degree = 12; + SFRp.precision= 50; + SFRp.BoundsCheckFreq=0; + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 2.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-7; + OFRp.mdtolerance= 1.0e-4; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + // Block4[0] = Dirichlet[1]; + // Block4[1] = Dirichlet[2]; + // Block4[2] = Dirichlet[3]; + Block4[0] = 0; + Block4[1] = 0; + Block4[2] = 0; + Block4[3] = Dirichlet[4]; + + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grid + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + Params.dirichlet=NonDirichlet; + FermionAction::ImplParams ParamsDir(boundary); + ParamsDir.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-8; + double MDStoppingCondition = 1e-6; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(4); + ActionLevel Level3(8); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp 
(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir); + FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir); + + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); + Level1.push_back(&StrangePseudoFermionBdy); + Level2.push_back(&StrangePseudoFermionLocal); + Level1.push_back(&StrangePseudoFermionPVBdy); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + std::vector *> Bdys; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + } else { + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + for(int h=0;h SdagS(StrangeOp); + HighBoundCheck(SdagS,vec,SFRp.hi); + ChebyBoundsCheck(SdagS,vec,SFRp.lo,SFRp.hi); + std::cout << "Strange inversion"<Mass() < UdagU(*Denominators[0]); + HighBoundCheck(UdagU,vec,OFRp.hi); + ChebyBoundsCheck(UdagU,vec,OFRp.lo,OFRp.hi); + std::cout << "light 
inversion"< SddagSd(StrangeOpDir); + HighBoundCheck(SddagSd,vec,OFRp.hi); + ChebyBoundsCheck(SddagSd,vec,OFRp.lo,OFRp.hi); + std::cout << "strange dirichlet inversion"<Mass()< UddagUd(*Numerators[0]); + HighBoundCheck(UddagUd,vec,OFRp.hi); + ChebyBoundsCheck(UddagUd,vec,OFRp.lo,OFRp.hi); + std::cout << "light dirichlet inversion"< Cheby(bound,90.,order); + FunctionHermOp OpCheby(Cheby,UddagUd); + PlainHermOp Op (UddagUd); + ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); + std::vector eval(Nm); + std::vector evec(Nm,rbgrid); + FermionField src(rbgrid);src = 1.0; + IRL.calc(eval,evec,src,Nconv); + + FermionField tmp(rbgrid); + FermionField ftmp(grid); + FermionField ftmp4(grid4); + for(int ev=0;ev Cheby(bound,90.,order); + FunctionHermOp OpCheby(Cheby,UdagU); + PlainHermOp Op (UdagU); + ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); + std::vector eval(Nm); + std::vector evec(Nm,rbgrid); + FermionField src(rbgrid); src = 1.0; + IRL.calc(eval,evec,src,Nconv); + + FermionField tmp(rbgrid); + FermionField ftmp(grid); + FermionField ftmp4(grid4); + for(int e=0;e Date: Thu, 30 Jun 2022 13:49:50 -0400 Subject: [PATCH 063/240] Create a new RNG file --- HMC/RNGstate.cc | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 HMC/RNGstate.cc diff --git a/HMC/RNGstate.cc b/HMC/RNGstate.cc new file mode 100644 index 00000000..a595a9e6 --- /dev/null +++ b/HMC/RNGstate.cc @@ -0,0 +1,53 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: + +Copyright (C) 2015-2016 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +int main(int argc, char **argv) +{ + using namespace Grid; + + Grid_init(&argc, &argv); + + Coordinate latt4 = GridDefaultLatt(); + Coordinate mpi = GridDefaultMpi(); + Coordinate simd = GridDefaultSimd(Nd,vComplexD::Nsimd()); + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd,mpi); + + GridSerialRNG sRNG; sRNG.SeedUniqueString(std::string("The Serial RNG")); + GridParallelRNG pRNG(UGrid); pRNG.SeedUniqueString(std::string("The 4D RNG")); + + std::string rngfile("ckpoint_rng.0"); + NerscIO::writeRNGState(sRNG, pRNG, rngfile); + + Grid_finalize(); +} + + + From 808bb592067012985be06ab90d8dec702ffaf41e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Jun 2022 13:50:09 -0400 Subject: [PATCH 064/240] Mixed prec DD-RHMC --- HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc | 444 ++++++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc new file mode 100644 index 00000000..a9b5dc7e --- /dev/null +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc @@ -0,0 +1,444 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + 
+Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + MixedPrecisionConjugateGradient MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 4; + MD.trajL = 1.0; + + HMCparameters 
HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.31; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.02132; + Real pv_mass = 1.0; + std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + + // FIXME: + // Same in MC and MD + // Need to mix precision too + OneFlavourRationalParams SFRp; // Strange + SFRp.lo = 4.0e-3; + SFRp.hi = 90.0; + SFRp.MaxIter = 60000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 1.0e-6; + SFRp.degree = 12; + SFRp.precision= 50; + SFRp.BoundsCheckFreq=0; + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 2.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-8; + OFRp.mdtolerance= 1.0e-6; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + + auto GridPtr = TheHMC.Resources.GetCartesian(); + 
auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeFieldF UF(GridPtrF); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + + // These lines are 
unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + Params.dirichlet=NonDirichlet; + FermionAction::ImplParams ParamsDir(boundary); + ParamsDir.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-10; + double MDStoppingCondition = 1e-7; + double MDStoppingConditionLoose = 1e-6; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(4); + ActionLevel Level3(8); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir); + FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir); + + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); + Level1.push_back(&StrangePseudoFermionBdy); + Level2.push_back(&StrangePseudoFermionLocal); + Level1.push_back(&StrangePseudoFermionPVBdy); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int 
n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector DenominatorsF; + std::vector *> Quotients; + std::vector *> Bdys; + std::vector ActionMPCG; + std::vector MPCG; + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } else { + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + for(int h=0;h Date: Thu, 30 Jun 2022 14:53:12 -0400 Subject: [PATCH 065/240] Remove debug --- systems/Crusher/config-command | 1 - 1 file changed, 1 deletion(-) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 9e68a354..c93ea9c8 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -1,7 +1,6 @@ ../../configure --enable-comms=mpi-auto \ --enable-unified=no \ --enable-shm=nvlink \ ---enable-debug \ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ From 751a4562d709f15cb7aeba72941554a6c6dbd04e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 1 Jul 2022 09:41:43 -0400 Subject: [PATCH 066/240] Timing improvement --- Grid/algorithms/iterative/ConjugateGradient.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 9a644c7e..b65eea46 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ 
b/Grid/algorithms/iterative/ConjugateGradient.h @@ -120,6 +120,9 @@ public: SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { + + GridStopWatch IterationTimer; + IterationTimer.Start(); c = cp; MatrixTimer.Start(); @@ -152,12 +155,13 @@ public: LinearCombTimer.Stop(); LinalgTimer.Stop(); + IterationTimer.Stop(); if ( (k % 500) == 0 ) { std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; } else { std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k - << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; + << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl; } // Stopping condition From 57b442d0deb18d44a98322b52b54a3892f0fc9dc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 1 Jul 2022 09:42:17 -0400 Subject: [PATCH 067/240] Log memory operations --- Grid/log/Log.cc | 6 ++++-- Grid/log/Log.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index 31f9a3f3..63bc454f 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -65,6 +65,7 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL"); GridLogger GridLogError (1, "Error" , GridLogColours, "RED"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL"); +GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); @@ -72,9 +73,10 @@ GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE"); void GridLogConfigure(std::vector &logstreams) { - GridLogError.Active(0); + GridLogError.Active(1); 
GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on + GridLogMemory.Active(0); // at least the messages should be always on GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); @@ -83,7 +85,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { - if (logstreams[i] == std::string("Error")) GridLogError.Active(1); + if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); diff --git a/Grid/log/Log.h b/Grid/log/Log.h index b1696fee..4d512ff6 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -183,6 +183,7 @@ extern GridLogger GridLogPerformance; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; extern GridLogger GridLogHMC; +extern GridLogger GridLogMemory; extern Colours GridLogColours; std::string demangle(const char* name) ; From bd99fd608c01c7778b3c50033942a9558ee572ea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 1 Jul 2022 09:42:53 -0400 Subject: [PATCH 068/240] Introduce a non-default stream for compute operatoins --- Grid/threads/Accelerator.cc | 4 ++++ Grid/threads/Accelerator.h | 18 ++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index cbc798a9..092d46b3 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -16,6 +16,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; #ifdef GRID_CUDA cudaDeviceProp *gpu_props; cudaStream_t copyStream; +cudaStream_t cpuStream; void acceleratorInit(void) { int nDevices = 1; @@ -98,6 +99,7 @@ void acceleratorInit(void) cudaSetDevice(device); cudaStreamCreate(©Stream); + 
cudaStreamCreate(&cpuStream); const int len=64; char busid[len]; if( rank == world_rank ) { @@ -112,6 +114,7 @@ void acceleratorInit(void) #ifdef GRID_HIP hipDeviceProp_t *gpu_props; hipStream_t copyStream; +hipStream_t cpuStream; void acceleratorInit(void) { int nDevices = 1; @@ -180,6 +183,7 @@ void acceleratorInit(void) #endif hipSetDevice(device); hipStreamCreate(©Stream); + hipStreamCreate(&cpuStream); const int len=64; char busid[len]; if( rank == world_rank ) { diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index a5fb7aa8..d88c08b4 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -107,6 +107,7 @@ void acceleratorInit(void); extern int acceleratorAbortOnGpuError; extern cudaStream_t copyStream; +extern cudaStream_t cpuStream; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -134,7 +135,7 @@ inline void cuda_mem(void) }; \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ } #define accelerator_for6dNB(iter1, num1, \ @@ -153,7 +154,7 @@ inline void cuda_mem(void) }; \ dim3 cu_blocks (num1,num2,num3); \ dim3 cu_threads(num4,num5,num6); \ - Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ + Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ } template __global__ @@ -189,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, #define accelerator_barrier(dummy) \ { \ - cudaDeviceSynchronize(); \ + cudaStreamSynchronize(cpuStream); \ cudaError err = cudaGetLastError(); \ if ( cudaSuccess != err ) { \ printf("accelerator_barrier(): Cuda error %s \n", \ @@ -339,6 +340,7 @@ NAMESPACE_BEGIN(Grid); #define accelerator_inline __host__ __device__ inline extern hipStream_t copyStream; +extern hipStream_t cpuStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int 
acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -360,12 +362,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ - 0,0, \ - num1,num2,nsimd, lambda); \ + 0,cpuStream, \ + num1,num2,nsimd, lambda); \ } else { \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ - 0,0, \ - num1,num2,nsimd, lambda); \ + 0,cpuStream, \ + num1,num2,nsimd, lambda); \ } \ } @@ -398,7 +400,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipDeviceSynchronize(); \ + hipStreamSynchronize(cpuStream); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ From 588c2f3cb191d9c438399e5039847285d3f9e4a9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 1 Jul 2022 09:44:58 -0400 Subject: [PATCH 069/240] Faster axpy_norm and innerProduct --- Grid/lattice/Lattice_reduction.h | 51 +++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 0ddac437..16feb856 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -232,6 +232,7 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & const uint64_t sites = grid->oSites(); // Might make all code paths go this way. +#if 0 typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; @@ -241,15 +242,31 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & autoView( right_v,right, AcceleratorRead); // GPU - SIMT lane compliance... 
- accelerator_for( ss, sites, 1,{ - auto x_l = left_v[ss]; - auto y_l = right_v[ss]; - inner_tmp_v[ss]=innerProductD(x_l,y_l); + accelerator_for( ss, sites, nsimd,{ + auto x_l = left_v(ss); + auto y_l = right_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l)); }); } +#else + typedef decltype(innerProduct(vobj(),vobj())) inner_t; + Vector inner_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + + { + autoView( left_v , left, AcceleratorRead); + autoView( right_v,right, AcceleratorRead); + // GPU - SIMT lane compliance... + accelerator_for( ss, sites, nsimd,{ + auto x_l = left_v(ss); + auto y_l = right_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); + }); + } +#endif // This is in single precision and fails some tests - auto anrm = sum(inner_tmp_v,sites); + auto anrm = sumD(inner_tmp_v,sites); nrm = anrm; return nrm; } @@ -283,7 +300,7 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt conformable(x,y); typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_typeD vector_type; + // typedef typename vobj::vector_typeD vector_type; RealD nrm; GridBase *grid = x.Grid(); @@ -295,17 +312,29 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt autoView( x_v, x, AcceleratorRead); autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); - +#if 0 typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - accelerator_for( ss, sites, 1,{ - auto tmp = a*x_v[ss]+b*y_v[ss]; - inner_tmp_v[ss]=innerProductD(tmp,tmp); - z_v[ss]=tmp; + accelerator_for( ss, sites, nsimd,{ + auto tmp = a*x_v(ss)+b*y_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp)); + coalescedWrite(z_v[ss],tmp); }); nrm = real(TensorRemove(sum(inner_tmp_v,sites))); +#else + typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; + Vector inner_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + + accelerator_for( ss, sites, nsimd,{ + auto tmp = 
a*x_v(ss)+b*y_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); + coalescedWrite(z_v[ss],tmp); + }); + nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); +#endif grid->GlobalSum(nrm); return nrm; } From 33e4a0caee673c2bdebfdf5b8dce9412d52bbe5f Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Fri, 1 Jul 2022 14:10:59 -0400 Subject: [PATCH 070/240] Imported changes from feature/gparity_HMC branch: Rework of WilsonFlow class Fixed logic error in smear method where the step index was initialized to 1 rather than 0, resulting in the logged output value of tau being too large by epsilon Previously smear_adaptive would maintain the current value of tau as a class member variable whereas smear would compute it separately; now both methods maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const. smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and also allowing the same class instance to be reused without needing to be reset Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during the smearing with a highly flexible general strategy where the user can add arbitrary measurements as functional objects that are evaluated at an arbitrary frequency By default the same plaquette-based measurements are performed, but additional example functions are provided where the smearing is performed with different choices of measurement that are returned as an array for further processing Added a method to compute the energy density using the Cloverleaf approach which has smaller discretization errors Added a new tensor utility operation, copyLane, which allows for the copying of a single SIMD lane between two instances of the same tensor type but potentially different precisions To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart to aid in tuning 
the Chebyshev Added Test_field_array_io which demonstrates and tests a single-file write of an arbitrary array of fields Added Test_evec_compression which generates evecs using Lanczos and attempts to compress them using the local coherence technique Added Test_compressed_lanczos_gparity which demonstrates the local coherence Lanczos for G-parity BCs Added HMC main programs for the 40ID and 48ID G-parity lattices --- .../iterative/LocalCoherenceLanczos.h | 19 +- Grid/qcd/observables/topological_charge.h | 2 +- Grid/qcd/smearing/WilsonFlow.h | 200 +++- Grid/tensors/Tensor_extract_merge.h | 41 + HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc | 918 ++++++++++++++++++ HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc | 873 +++++++++++++++++ tests/IO/Test_field_array_io.cc | 184 ++++ .../Test_compressed_lanczos_gparity.cc | 485 +++++++++ tests/lanczos/Test_evec_compression.cc | 582 +++++++++++ 9 files changed, 3247 insertions(+), 57 deletions(-) create mode 100644 HMC/Mobius2p1fIDSDRGparityEOFA_40ID.cc create mode 100644 HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc create mode 100644 tests/IO/Test_field_array_io.cc create mode 100644 tests/lanczos/Test_compressed_lanczos_gparity.cc create mode 100644 tests/lanczos/Test_evec_compression.cc diff --git a/Grid/algorithms/iterative/LocalCoherenceLanczos.h b/Grid/algorithms/iterative/LocalCoherenceLanczos.h index 0a2fe55c..344a785a 100644 --- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h +++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h @@ -146,14 +146,21 @@ public: LinearOperatorBase &_Linop; RealD _coarse_relax_tol; std::vector &_subspace; + + int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator + //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult + //To work around this issue, every restart we separately reconstruct the fine operator eval for the 
lowest and highest evec and print these + //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1) + //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, OperatorFunction &smoother, LinearOperatorBase &Linop, std::vector &subspace, - RealD coarse_relax_tol=5.0e3) + RealD coarse_relax_tol=5.0e3, + int largestEvalIdxForReport=-1) : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace), - _coarse_relax_tol(coarse_relax_tol) + _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport) { }; //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection) @@ -179,6 +186,12 @@ public: <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv < ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); evals_coarse.resize(Nm); evec_coarse.resize(Nm,_CoarseGrid); diff --git a/Grid/qcd/observables/topological_charge.h b/Grid/qcd/observables/topological_charge.h index 4f116496..7c09a180 100644 --- a/Grid/qcd/observables/topological_charge.h +++ b/Grid/qcd/observables/topological_charge.h @@ -99,7 +99,7 @@ public: // using wilson flow by default here WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); - Real T0 = WF.energyDensityPlaquette(Usmear); + Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear); std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) << "T0 : [ " << traj << " ] "<< T0 << std::endl; } diff --git 
a/Grid/qcd/smearing/WilsonFlow.h b/Grid/qcd/smearing/WilsonFlow.h index 19fd94e2..0d1ee5d2 100644 --- a/Grid/qcd/smearing/WilsonFlow.h +++ b/Grid/qcd/smearing/WilsonFlow.h @@ -7,6 +7,7 @@ Source file: ./lib/qcd/modules/plaquette.h Copyright (C) 2017 Author: Guido Cossu +Author: Christopher Kelly This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,28 +34,44 @@ NAMESPACE_BEGIN(Grid); template class WilsonFlow: public Smear{ +public: + //Store generic measurements to take during smearing process using std::function + typedef std::function FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field + +private: unsigned int Nstep; - unsigned int measure_interval; - mutable RealD epsilon, taus; - + RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step + + std::vector< std::pair > functions; //The int maps to the measurement frequency mutable WilsonGaugeAction SG; - void evolve_step(typename Gimpl::GaugeField&) const; - void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD); - RealD tau(unsigned int t)const {return epsilon*(t+1.0); } + //Evolve the gauge field by 1 step and update tau + void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const; + //Evolve the gauge field by 1 step and update tau and the current time step eps + void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const; public: INHERIT_GIMPL_TYPES(Gimpl) + void resetActions(){ functions.clear(); } + + void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); } + + //Set the class to perform the default measurements: + //the plaquette energy density every step + //the plaquette topological charge every 'topq_meas_interval' steps + //and output to stdout + void setDefaultMeasurements(int topq_meas_interval = 1); + explicit WilsonFlow(unsigned int Nstep, 
RealD epsilon, unsigned int interval = 1): Nstep(Nstep), epsilon(epsilon), - measure_interval(interval), SG(WilsonGaugeAction(3.0)) { // WilsonGaugeAction with beta 3.0 assert(epsilon > 0.0); LogMessage(); + setDefaultMeasurements(interval); } void LogMessage() { @@ -73,9 +90,29 @@ public: // undefined for WilsonFlow } - void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau); - RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const; - RealD energyDensityPlaquette(const GaugeField& U) const; + void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const; + + //Compute t^2 for time t from the plaquette + static RealD energyDensityPlaquette(const RealD t, const GaugeField& U); + + //Compute t^2 for time t from the 1x1 cloverleaf form + //t is the Wilson flow time + static RealD energyDensityCloverleaf(const RealD t, const GaugeField& U); + + //Evolve the gauge field by Nstep steps of epsilon and return the energy density computed every interval steps + //The smeared field is output as V + std::vector flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval = 1); + + //Version that does not return the smeared field + std::vector flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval = 1); + + + //Evolve the gauge field by Nstep steps of epsilon and return the Cloverleaf energy density computed every interval steps + //The smeared field is output as V + std::vector flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval = 1); + + //Version that does not return the smeared field + std::vector flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1); }; @@ -83,7 +120,7 @@ public: // Implementations //////////////////////////////////////////////////////////////////////////////// template -void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U) const{ +void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U, 
RealD &tau) const{ GaugeField Z(U.Grid()); GaugeField tmp(U.Grid()); SG.deriv(U, Z); @@ -99,12 +136,13 @@ void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U) const{ SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 + tau += epsilon; } template -void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) { - if (maxTau - taus < epsilon){ - epsilon = maxTau-taus; +void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{ + if (maxTau - tau < eps){ + eps = maxTau-tau; } //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; GaugeField Z(U.Grid()); @@ -114,95 +152,151 @@ void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real SG.deriv(U, Z); Zprime = -Z; Z *= 0.25; // Z0 = 1/4 * F(U) - Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 + Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0 Z *= -17.0/8.0; SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 Zprime += 2.0*tmp; Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 - Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 + Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1 Z *= -4.0/3.0; SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 - Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 + Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2 // Ramos - Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0 + Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0 // Compute distance as norm^2 of the difference GaugeField diffU = U - Uprime; RealD diff = norm2(diffU); // adjust integration step - taus += epsilon; + tau += eps; //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << 
std::endl; - epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); + eps = eps*0.95*std::pow(1e-4/diff,1./3.); //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; } + template -RealD WilsonFlow::energyDensityPlaquette(unsigned int step, const GaugeField& U) const { - RealD td = tau(step); - return 2.0 * td * td * SG.S(U)/U.Grid()->gSites(); +RealD WilsonFlow::energyDensityPlaquette(const RealD t, const GaugeField& U){ + static WilsonGaugeAction SG(3.0); + return 2.0 * t * t * SG.S(U)/U.Grid()->gSites(); +} + +//Compute t^2 for time from the 1x1 cloverleaf form +template +RealD WilsonFlow::energyDensityCloverleaf(const RealD t, const GaugeField& U){ + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + assert(Nd == 4); + //E = 1/2 tr( F_munu F_munu ) + //However as F_numu = -F_munu, only need to sum the trace of the squares of the following 6 field strengths: + //F_01 F_02 F_03 F_12 F_13 F_23 + GaugeMat F(U.Grid()); + LatticeComplexD R(U.Grid()); + R = Zero(); + + for(int mu=0;mu<3;mu++){ + for(int nu=mu+1;nu<4;nu++){ + WilsonLoops::FieldStrength(F, U, mu, nu); + R = R + trace(F*F); + } + } + ComplexD out = sum(R); + out = t*t*out / RealD(U.Grid()->gSites()); + return -real(out); //minus sign necessary for +ve energy +} + + +template +std::vector WilsonFlow::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){ + std::vector out; + resetActions(); + addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Computing plaquette energy density for step " << step << std::endl; + out.push_back( energyDensityPlaquette(t,U) ); + }); + smear(V,U); + return out; } template -RealD WilsonFlow::energyDensityPlaquette(const GaugeField& U) const { - return 2.0 * taus * taus * SG.S(U)/U.Grid()->gSites(); +std::vector WilsonFlow::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int 
measure_interval){ + GaugeField V(U); + return flowMeasureEnergyDensityPlaquette(V,U, measure_interval); } +template +std::vector WilsonFlow::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){ + std::vector out; + resetActions(); + addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl; + out.push_back( energyDensityCloverleaf(t,U) ); + }); + smear(V,U); + return out; +} + +template +std::vector WilsonFlow::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){ + GaugeField V(U); + return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval); +} + + //#define WF_TIMING - - - template -void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { +void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const{ out = in; - for (unsigned int step = 1; step <= Nstep; step++) { + RealD taus = 0.; + for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement auto start = std::chrono::high_resolution_clock::now(); - evolve_step(out); + evolve_step(out, taus); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; #ifdef WF_TIMING std::cout << "Time to evolve " << diff.count() << " s\n"; #endif - std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " - << step << " " << tau(step) << " " - << energyDensityPlaquette(step,out) << std::endl; - if( step % measure_interval == 0){ - std::cout << GridLogMessage << "[WilsonFlow] Top. 
charge : " - << step << " " - << WilsonLoops::TopologicalCharge(out) << std::endl; - } + //Perform measurements + for(auto const &meas : functions) + if( step % meas.first == 0 ) meas.second(step,taus,out); } } template -void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){ +void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{ out = in; - taus = epsilon; + RealD taus = 0.; + RealD eps = epsilon; unsigned int step = 0; do{ step++; //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; - evolve_step_adaptive(out, maxTau); - std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " - << step << " " << taus << " " - << energyDensityPlaquette(out) << std::endl; - if( step % measure_interval == 0){ - std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " - << step << " " - << WilsonLoops::TopologicalCharge(out) << std::endl; - } + evolve_step_adaptive(out, taus, eps, maxTau); + //Perform measurements + for(auto const &meas : functions) + if( step % meas.first == 0 ) meas.second(step,taus,out); } while (taus < maxTau); - - - } +template +void WilsonFlow::setDefaultMeasurements(int topq_meas_interval){ + addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl; + }); + addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Top. 
charge : " << step << " " << WilsonLoops::TopologicalCharge(U) << std::endl; + }); +} + + NAMESPACE_END(Grid); diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index ea619d0f..ab14f81f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -208,5 +208,46 @@ void merge(vobj &vec,const ExtractPointerArray &extracted, int offset) } + +////////////////////////////////////////////////////////////////////////////////// +//Copy a single lane of a SIMD tensor type from one object to another +//Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type) +/////////////////////////////////////////////////////////////////////////////////// +template +accelerator_inline +void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in) +{ + static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + + typedef typename vobjOut::vector_type ovector_type; + typedef typename vobjIn::vector_type ivector_type; + constexpr int owords=sizeof(vobjOut)/sizeof(ovector_type); + constexpr int iwords=sizeof(vobjIn)/sizeof(ivector_type); + static_assert( owords == iwords, "copyLane: Expected number of vector words in input and output objects to be equal" ); + + typedef typename vobjOut::scalar_type oscalar_type; + typedef typename vobjIn::scalar_type iscalar_type; + typedef typename ExtractTypeMap::extract_type oextract_type; + typedef typename ExtractTypeMap::extract_type iextract_type; + + typedef oextract_type * opointer; + typedef iextract_type * ipointer; + + constexpr int oNsimd=ovector_type::Nsimd(); + constexpr int iNsimd=ivector_type::Nsimd(); + + iscalar_type itmp; + oscalar_type otmp; + + opointer __restrict__ op = (opointer)&vecOut; + ipointer __restrict__ ip = (ipointer)&vecIn; + for(int w=0;w +Author: 
Peter Boyle + + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; + +//Production binary for the 40ID G-parity ensemble + +struct RatQuoParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters, + double, bnd_lo, + double, bnd_hi, + Integer, action_degree, + double, action_tolerance, + Integer, md_degree, + double, md_tolerance, + Integer, reliable_update_freq, + Integer, bnd_check_freq); + RatQuoParameters() { + bnd_lo = 1e-2; + bnd_hi = 30; + action_degree = 10; + action_tolerance = 1e-10; + md_degree = 10; + md_tolerance = 1e-8; + bnd_check_freq = 20; + reliable_update_freq = 50; + } + + void Export(RationalActionParams &into) const{ + into.lo = bnd_lo; + into.hi = bnd_hi; + into.action_degree = action_degree; + into.action_tolerance = action_tolerance; + into.md_degree = md_degree; + into.md_tolerance = md_tolerance; + into.BoundsCheckFreq = bnd_check_freq; + } +}; + +struct EOFAparameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters, + OneFlavourRationalParams, rat_params, + double, action_tolerance, + double, action_mixcg_inner_tolerance, + double, md_tolerance, + 
double, md_mixcg_inner_tolerance); + + EOFAparameters() { + action_mixcg_inner_tolerance = 1e-8; + action_tolerance = 1e-10; + md_tolerance = 1e-8; + md_mixcg_inner_tolerance = 1e-8; + + rat_params.lo = 1.0; + rat_params.hi = 25.0; + rat_params.MaxIter = 50000; + rat_params.tolerance= 1.0e-9; + rat_params.degree = 14; + rat_params.precision= 50; + } +}; + +struct EvolParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters, + Integer, StartTrajectory, + Integer, Trajectories, + Integer, SaveInterval, + Integer, Steps, + RealD, TrajectoryLength, + bool, MetropolisTest, + std::string, StartingType, + std::vector, GparityDirs, + std::vector, eofa_l, + RatQuoParameters, rat_quo_s, + RatQuoParameters, rat_quo_DSDR); + + EvolParameters() { + //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart + MetropolisTest = false; + StartTrajectory = 0; + Trajectories = 50; + SaveInterval = 5; + StartingType = "ColdStart"; + GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic + Steps = 5; + TrajectoryLength = 1.0; + } +}; + +bool fileExists(const std::string &fn){ + std::ifstream f(fn); + return f.good(); +} + + + + +struct LanczosParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, + double, alpha, + double, beta, + double, mu, + int, ord, + int, n_stop, + int, n_want, + int, n_use, + double, tolerance); + + LanczosParameters() { + alpha = 35; + beta = 5; + mu = 0; + ord = 100; + n_stop = 10; + n_want = 10; + n_use = 15; + tolerance = 1e-6; + } +}; + + + +template +void computeEigenvalues(std::string param_file, + GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something + FermionActionD &action, GridParallelRNG &rng){ + + LanczosParameters params; + if(fileExists(param_file)){ + std::cout << GridLogMessage << " Reading " << param_file << std::endl; + Grid::XmlReader rd(param_file); + read(rd, 
"LanczosParameters", params); + }else if(!GlobalSharedMemory::WorldRank){ + std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl; + std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl; + Grid::XmlWriter wr(param_file + ".templ"); + write(wr, "LanczosParameters", params); + } + + FermionFieldD gauss_o(rbGrid); + FermionFieldD gauss(Grid); + gaussian(rng, gauss); + pickCheckerboard(Odd, gauss_o, gauss); + + action.ImportGauge(latt); + + SchurDiagMooeeOperator hermop(action); + PlainHermOp hermop_wrap(hermop); + //ChebyshevLanczos Cheb(params.alpha, params.beta, params.mu, params.ord); + assert(params.mu == 0.0); + + Chebyshev Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1); + FunctionHermOp Cheb_wrap(Cheb, hermop); + + std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl; + ImplicitlyRestartedLanczos IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 50000); + + std::vector eval(params.n_use); + std::vector evec(params.n_use, rbGrid); + int Nconv; + IRL.calc(eval, evec, gauss_o, Nconv); + + std::cout << "Eigenvalues:" << std::endl; + for(int i=0;i +void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something + FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng, + int inv_pow, const std::string &quark_descr, int action_or_md){ + assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2); + + FermionFieldD gauss_o(rbGrid); + FermionFieldD gauss(Grid); + gaussian(rng, gauss); + pickCheckerboard(Odd, gauss_o, gauss); + + numOp.ImportGauge(latt); + denOp.ImportGauge(latt); + + typedef typename FermionActionD::Impl_t FermionImplPolicyD; + SchurDifferentiableOperator MdagM(numOp); + SchurDifferentiableOperator VdagV(denOp); + 
+ PowerMethod power_method; + RealD lambda_max; + + std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl; + + lambda_max = power_method(MdagM,gauss_o); + std::cout << GridLogMessage << "Got lambda_max "< +void checkEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl; + typename FermionImplPolicy::FermionField eta(FGrid); + RealD scale = std::sqrt(0.5); + gaussian(rng,eta); eta = eta * scale; + + //Use the inbuilt check + EOFA.refresh(latt, eta); + EOFA.S(latt); + std::cout << GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl; +} + + +template +class EOFAlinop: public LinearOperatorBase{ + ExactOneFlavourRatioPseudoFermionAction &EOFA; + LatticeGaugeFieldD &U; +public: + EOFAlinop(ExactOneFlavourRatioPseudoFermionAction &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){} + + typedef typename FermionImplPolicy::FermionField Field; + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } + void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); } +}; + +template +void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl; + EOFAlinop linop(EOFA, latt); + typename FermionImplPolicy::FermionField eta(FGrid); + gaussian(rng,eta); + PowerMethod power_method; + auto lambda_max = power_method(linop,eta); + std::cout << GridLogMessage << "Upper 
bound of EOFA operator " << lambda_max << std::endl; +} + +//Applications of M^{-1} cost the same as M for EOFA! +template +class EOFAinvLinop: public LinearOperatorBase{ + ExactOneFlavourRatioPseudoFermionAction &EOFA; + LatticeGaugeFieldD &U; +public: + EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){} + + typedef typename FermionImplPolicy::FermionField Field; + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } + void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); } +}; + +template +void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl; + EOFAinvLinop linop(EOFA, latt); + typename FermionImplPolicy::FermionField eta(FGrid); + gaussian(rng,eta); + PowerMethod power_method; + auto lambda_max = power_method(linop,eta); + std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl; +} + + +NAMESPACE_BEGIN(Grid); + + template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + }; + + void operator()(LinearOperatorBase &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); + + precisionChange(FermOpF.Umu, FermOpD.Umu); + + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + MixedPrecisionConjugateGradient MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); + MPCG.InnerTolerance = InnerTolerance; + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < + class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterations; + + RealD Delta; //reliable update parameter + + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, + RealD delta, + Integer maxit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + Delta(delta), + MaxIterations(maxit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5) + { + }; + + void operator()(LinearOperatorBase &LinOpU, const FieldD &src, FieldD &psi) { 
+ + std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<(&LinOpU); + assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); + + precisionChange(FermOpF.Umu, FermOpD.Umu); + + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + + ConjugateGradientReliableUpdate MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD); + std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" < tmp; + GridCmdOptionIntVector(argv[i+1],tmp); + { + std::stringstream ss; + for(int j=0;j MixedPrecRHMC; + typedef GeneralEvenOddRatioRationalPseudoFermionAction DoublePrecRHMC; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + typedef ConjugateHMCRunnerD HMCWrapper; //NB: This is the "Omelyan integrator" + MD.name = std::string("MinimumNorm2"); + + // typedef ConjugateHMCRunnerD HMCWrapper; + // MD.name = std::string("ForceGradient"); + + MD.MDsteps = user_params.Steps; + MD.trajL = user_params.TrajectoryLength; + + typedef HMCWrapper::ImplPolicy GaugeImplPolicy; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = user_params.StartTrajectory; + HMCparams.Trajectories = user_params.Trajectories; + HMCparams.NoMetropolisUntil= 0; + HMCparams.StartingType = user_params.StartingType; + HMCparams.MetropolisTest = user_params.MetropolisTest; + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_lat"; + CPparams.rng_prefix = "ckpoint_rng"; + CPparams.saveInterval = 
user_params.SaveInterval; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + //Note that checkpointing saves the RNG state so that this initialization is required only for the very first configuration + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = serial_seeds; + RNGpar.parallel_seeds = parallel_seeds; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + //aiming for ainv=1.723 GeV + // me bob + //Estimated a(ml+mres) [40ID] = 0.001305 0.00131 + // a(mh+mres) [40ID] = 0.035910 0.03529 + //Estimate Ls=12, b+c=2 mres~0.0011 + + //1/24/2022 initial mres measurement gives mres=0.001, adjusted light quark mass to 0.0003 from 0.0001 + + const int Ls = 12; + Real beta = 1.848; + Real light_mass = 0.0003; + Real strange_mass = 0.0342; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD mobius_scale = 2.; //b+c + + RealD mob_bmc = 1.0; + RealD mob_b = (mobius_scale + mob_bmc)/2.; + RealD mob_c = (mobius_scale - mob_bmc)/2.; + + std::cout << GridLogMessage + << "Ensemble parameters:" << std::endl + << "Ls=" << Ls << std::endl + << "beta=" << beta << std::endl + << "light_mass=" << light_mass << std::endl + << "strange_mass=" << strange_mass << std::endl + << "mobius_scale=" << mobius_scale << std::endl; + + //Setup the Grids + auto UGridD = TheHMC.Resources.GetCartesian(); + auto UrbGridD = TheHMC.Resources.GetRBCartesian(); + auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + ConjugateIwasakiGaugeActionD 
GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD Ud(UGridD); + LatticeGaugeFieldF Uf(UGridF); + + //Setup the BCs + FermionActionD::ImplParams Params; + for(int i=0;i dirs4(Nd); + for(int i=0;i Level1(1); //light quark + strange quark + ActionLevel Level2(4); //DSDR + ActionLevel Level3(2); //gauge + + + ///////////////////////////////////////////////////////////// + // Light EOFA action + // have to be careful with the parameters, cf. Test_dwf_gpforce_eofa.cc + ///////////////////////////////////////////////////////////// + typedef SchurDiagMooeeOperator EOFAschuropD; + typedef SchurDiagMooeeOperator EOFAschuropF; + typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction EOFAmixPrecPFaction; + typedef MixedPrecisionConjugateGradientOperatorFunction EOFA_mxCG; + typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction EOFA_relupCG; + + + std::vector eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 }; + std::vector eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 }; + int n_light_hsb = 5; + assert(user_params.eofa_l.size() == n_light_hsb); + + EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb]; + + for(int i=0;iInnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance; + + EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D); + ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance; + + EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D); + DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance; + + EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 50000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D); + DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance; + + std::cout << GridLogMessage << "Set EOFA action 
solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl; + std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl; +#endif + + EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF, + *LopD, *RopD, + *ActionMCG_L, *ActionMCG_R, + *ActionMCG_L, *ActionMCG_R, + *DerivMCG_L, *DerivMCG_R, + user_params.eofa_l[i].rat_params, true); + EOFA_pfactions[i] = EOFA; + Level1.push_back(EOFA); + } + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params); + FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params); + + FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params); + FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params); + + RationalActionParams rat_act_params_s; + rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4} + rat_act_params_s.precision= 60; + rat_act_params_s.MaxIter = 50000; + user_params.rat_quo_s.Export(rat_act_params_s); + std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl; + + //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); + DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); + Level1.push_back(&Quotient_s); + + /////////////////////////////////// + // DSDR action + /////////////////////////////////// + RealD dsdr_mass=-1.8; + //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf + RealD dsdr_epsilon_f = 0.02; //numerator (in determinant) + RealD dsdr_epsilon_b = 0.5; + GparityWilsonTMFermionD 
Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params); + GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params); + + GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params); + GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params); + + RationalActionParams rat_act_params_DSDR; + rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2} + rat_act_params_DSDR.precision= 60; + rat_act_params_DSDR.MaxIter = 50000; + user_params.rat_quo_DSDR.Export(rat_act_params_DSDR); + std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl; + + DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR); + Level2.push_back(&Quotient_DSDR); + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + Level3.push_back(&GaugeAction); + + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + TheHMC.TheAction.push_back(Level3); + std::cout << GridLogMessage << " Action complete "<< std::endl; + + + //Action tuning + bool + tune_rhmc_s=false, eigenrange_s=false, + tune_rhmc_DSDR=false, eigenrange_DSDR=false, + check_eofa=false, + upper_bound_eofa=false, lower_bound_eofa(false); + + std::string lanc_params_s; + std::string lanc_params_DSDR; + int tune_rhmc_s_action_or_md; + int tune_rhmc_DSDR_action_or_md; + int eofa_which_hsb; + + for(int i=1;i= 0 && eofa_which_hsb < n_light_hsb) ); + } + else if(sarg == "--upper_bound_eofa"){ + assert(i < argc-1); + upper_bound_eofa = true; + eofa_which_hsb = std::stoi(argv[i+1]); + assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb); + } + else if(sarg == "--lower_bound_eofa"){ + assert(i < argc-1); + lower_bound_eofa = true; + eofa_which_hsb = std::stoi(argv[i+1]); + 
assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb); + } + } + if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) { + std::cout << GridLogMessage << "Running checks" << std::endl; + TheHMC.initializeGaugeFieldAndRNGs(Ud); + + //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << ActionMCG_L.InnerTolerance << std::endl; + //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl; + + if(check_eofa){ + if(eofa_which_hsb >= 0){ + std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl; + checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud); + std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl; + }else{ + for(int i=0;i(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG()); + if(tune_rhmc_s) checkRHMC(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md); + if(eigenrange_DSDR) computeEigenvalues(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG()); + if(tune_rhmc_DSDR) checkRHMC(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md); + + + std::cout << GridLogMessage << " Done" << std::endl; + Grid_finalize(); + return 0; + } + + + //Run the HMC + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.Run(); + + std::cout << GridLogMessage << " Done" << std::endl; + Grid_finalize(); + return 0; +} // main diff --git a/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc new file mode 100644 index 00000000..42f54edd --- 
/dev/null +++ b/HMC/Mobius2p1fIDSDRGparityEOFA_48ID.cc @@ -0,0 +1,873 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./HMC/Mobius2p1fIDSDRGparityEOFA.cc + +Copyright (C) 2015-2016 + +Author: Christopher Kelly +Author: Peter Boyle + + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; + +//Production binary for the 40ID G-parity ensemble + +struct RatQuoParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(RatQuoParameters, + double, bnd_lo, + double, bnd_hi, + Integer, action_degree, + double, action_tolerance, + Integer, md_degree, + double, md_tolerance, + Integer, reliable_update_freq, + Integer, bnd_check_freq); + RatQuoParameters() { + bnd_lo = 1e-2; + bnd_hi = 30; + action_degree = 10; + action_tolerance = 1e-10; + md_degree = 10; + md_tolerance = 1e-8; + bnd_check_freq = 20; + reliable_update_freq = 50; + } + + void Export(RationalActionParams &into) const{ + into.lo = bnd_lo; + into.hi = bnd_hi; + into.action_degree = action_degree; + into.action_tolerance = action_tolerance; + into.md_degree = md_degree; + into.md_tolerance = md_tolerance; + into.BoundsCheckFreq = bnd_check_freq; + } +}; + +struct EOFAparameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(EOFAparameters, + OneFlavourRationalParams, rat_params, + double, action_tolerance, + double, action_mixcg_inner_tolerance, + double, md_tolerance, + double, md_mixcg_inner_tolerance); + + EOFAparameters() { + action_mixcg_inner_tolerance = 1e-8; + action_tolerance = 1e-10; + md_tolerance = 1e-8; + md_mixcg_inner_tolerance = 1e-8; + + rat_params.lo = 1.0; + rat_params.hi = 25.0; + rat_params.MaxIter = 10000; + rat_params.tolerance= 1.0e-9; + rat_params.degree = 14; + rat_params.precision= 50; + } +}; + +struct EvolParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(EvolParameters, + Integer, StartTrajectory, + Integer, Trajectories, + Integer, SaveInterval, + Integer, Steps, + RealD, TrajectoryLength, + bool, MetropolisTest, + std::string, StartingType, + std::vector, GparityDirs, + std::vector, eofa_l, + RatQuoParameters, rat_quo_s, + 
RatQuoParameters, rat_quo_DSDR); + + EvolParameters() { + //For initial thermalization; afterwards user should switch Metropolis on and use StartingType=CheckpointStart + MetropolisTest = false; + StartTrajectory = 0; + Trajectories = 50; + SaveInterval = 5; + StartingType = "ColdStart"; + GparityDirs.resize(3, 1); //1 for G-parity, 0 for periodic + Steps = 5; + TrajectoryLength = 1.0; + } +}; + +bool fileExists(const std::string &fn){ + std::ifstream f(fn); + return f.good(); +} + + + + +struct LanczosParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, + double, alpha, + double, beta, + double, mu, + int, ord, + int, n_stop, + int, n_want, + int, n_use, + double, tolerance); + + LanczosParameters() { + alpha = 35; + beta = 5; + mu = 0; + ord = 100; + n_stop = 10; + n_want = 10; + n_use = 15; + tolerance = 1e-6; + } +}; + + + +template +void computeEigenvalues(std::string param_file, + GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something + FermionActionD &action, GridParallelRNG &rng){ + + LanczosParameters params; + if(fileExists(param_file)){ + std::cout << GridLogMessage << " Reading " << param_file << std::endl; + Grid::XmlReader rd(param_file); + read(rd, "LanczosParameters", params); + }else if(!GlobalSharedMemory::WorldRank){ + std::cout << GridLogMessage << " File " << param_file << " does not exist" << std::endl; + std::cout << GridLogMessage << " Writing xml template to " << param_file << ".templ" << std::endl; + Grid::XmlWriter wr(param_file + ".templ"); + write(wr, "LanczosParameters", params); + } + + FermionFieldD gauss_o(rbGrid); + FermionFieldD gauss(Grid); + gaussian(rng, gauss); + pickCheckerboard(Odd, gauss_o, gauss); + + action.ImportGauge(latt); + + SchurDiagMooeeOperator hermop(action); + PlainHermOp hermop_wrap(hermop); + //ChebyshevLanczos Cheb(params.alpha, params.beta, params.mu, params.ord); + assert(params.mu == 0.0); + + 
Chebyshev Cheb(params.beta*params.beta, params.alpha*params.alpha, params.ord+1); + FunctionHermOp Cheb_wrap(Cheb, hermop); + + std::cout << "IRL: alpha=" << params.alpha << " beta=" << params.beta << " mu=" << params.mu << " ord=" << params.ord << std::endl; + ImplicitlyRestartedLanczos IRL(Cheb_wrap, hermop_wrap, params.n_stop, params.n_want, params.n_use, params.tolerance, 10000); + + std::vector eval(params.n_use); + std::vector evec(params.n_use, rbGrid); + int Nconv; + IRL.calc(eval, evec, gauss_o, Nconv); + + std::cout << "Eigenvalues:" << std::endl; + for(int i=0;i +void checkRHMC(GridCartesian* Grid, GridRedBlackCartesian* rbGrid, const LatticeGaugeFieldD &latt, //expect lattice to have been initialized to something + FermionActionD &numOp, FermionActionD &denOp, RHMCtype &rhmc, GridParallelRNG &rng, + int inv_pow, const std::string &quark_descr, int action_or_md){ + assert(action_or_md == 0 || action_or_md == 1 || action_or_md == 2); + + FermionFieldD gauss_o(rbGrid); + FermionFieldD gauss(Grid); + gaussian(rng, gauss); + pickCheckerboard(Odd, gauss_o, gauss); + + numOp.ImportGauge(latt); + denOp.ImportGauge(latt); + + typedef typename FermionActionD::Impl_t FermionImplPolicyD; + SchurDifferentiableOperator MdagM(numOp); + SchurDifferentiableOperator VdagV(denOp); + + PowerMethod power_method; + RealD lambda_max; + + std::cout << "Starting: Get RHMC high bound approx for " << quark_descr << " numerator" << std::endl; + + lambda_max = power_method(MdagM,gauss_o); + std::cout << GridLogMessage << "Got lambda_max "< +void checkEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, const LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA action/bounds check" << std::endl; + typename FermionImplPolicy::FermionField eta(FGrid); + RealD scale = std::sqrt(0.5); + gaussian(rng,eta); eta = eta * scale; + + //Use the inbuilt check + EOFA.refresh(latt, eta); + EOFA.S(latt); + std::cout << 
GridLogMessage << "Finished EOFA upper action/bounds check" << std::endl; +} + + +template +class EOFAlinop: public LinearOperatorBase{ + ExactOneFlavourRatioPseudoFermionAction &EOFA; + LatticeGaugeFieldD &U; +public: + EOFAlinop(ExactOneFlavourRatioPseudoFermionAction &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){} + + typedef typename FermionImplPolicy::FermionField Field; + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } + void HermOp(const Field &in, Field &out){ EOFA.Meofa(U, in, out); } +}; + +template +void upperBoundEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA upper bound compute" << std::endl; + EOFAlinop linop(EOFA, latt); + typename FermionImplPolicy::FermionField eta(FGrid); + gaussian(rng,eta); + PowerMethod power_method; + auto lambda_max = power_method(linop,eta); + std::cout << GridLogMessage << "Upper bound of EOFA operator " << lambda_max << std::endl; +} + +//Applications of M^{-1} cost the same as M for EOFA! 
+template +class EOFAinvLinop: public LinearOperatorBase{ + ExactOneFlavourRatioPseudoFermionAction &EOFA; + LatticeGaugeFieldD &U; +public: + EOFAinvLinop(ExactOneFlavourRatioPseudoFermionAction &EOFA, LatticeGaugeFieldD &U): EOFA(EOFA), U(U){} + + typedef typename FermionImplPolicy::FermionField Field; + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } + void HermOp(const Field &in, Field &out){ EOFA.MeofaInv(U, in, out); } +}; + +template +void lowerBoundEOFA(ExactOneFlavourRatioPseudoFermionAction &EOFA, + GridCartesian* FGrid, GridParallelRNG &rng, LatticeGaugeFieldD &latt){ + std::cout << GridLogMessage << "Starting EOFA lower bound compute using power method on M^{-1}. Inverse of highest eigenvalue is the lowest eigenvalue of M" << std::endl; + EOFAinvLinop linop(EOFA, latt); + typename FermionImplPolicy::FermionField eta(FGrid); + gaussian(rng,eta); + PowerMethod power_method; + auto lambda_max = power_method(linop,eta); + std::cout << GridLogMessage << "Lower bound of EOFA operator " << 1./lambda_max << std::endl; +} + + +NAMESPACE_BEGIN(Grid); + + template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + }; + + void operator()(LinearOperatorBase &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); + + precisionChange(FermOpF.Umu, FermOpD.Umu); + + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + MixedPrecisionConjugateGradient MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); + MPCG.InnerTolerance = InnerTolerance; + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < + class MixedPrecisionReliableUpdateConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterations; + + RealD Delta; //reliable update parameter + + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + MixedPrecisionReliableUpdateConjugateGradientOperatorFunction(RealD tol, + RealD delta, + Integer maxit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + Delta(delta), + MaxIterations(maxit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5) + { + }; + + void operator()(LinearOperatorBase &LinOpU, const FieldD &src, FieldD &psi) { 
+ + std::cout << GridLogMessage << " Mixed precision reliable CG update wrapper operator() "<(&LinOpU); + assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); + + precisionChange(FermOpF.Umu, FermOpD.Umu); + + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// + + ConjugateGradientReliableUpdate MPCG(Tolerance,MaxIterations,Delta,SinglePrecGrid5,LinOpF,LinOpD); + std::cout << GridLogMessage << "Calling mixed precision reliable update Conjugate Gradient" < MixedPrecRHMC; + typedef GeneralEvenOddRatioRationalPseudoFermionAction DoublePrecRHMC; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + typedef ConjugateHMCRunnerD HMCWrapper; //NB: This is the "Omelyan integrator" + typedef HMCWrapper::ImplPolicy GaugeImplPolicy; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = user_params.Steps; + MD.trajL = user_params.TrajectoryLength; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = user_params.StartTrajectory; + HMCparams.Trajectories = user_params.Trajectories; + HMCparams.NoMetropolisUntil= 0; + HMCparams.StartingType = user_params.StartingType; + HMCparams.MetropolisTest = user_params.MetropolisTest; + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_lat"; + CPparams.rng_prefix = "ckpoint_rng"; + CPparams.saveInterval = user_params.SaveInterval; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + //Note that checkpointing saves the RNG state so that this initialization is required only 
for the very first configuration + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + //aiming for ainv=2.068 me Bob + //Estimated a(ml+mres) [48ID] = 0.001048 0.00104 + // a(mh+mres) [48ID] = 0.028847 0.02805 + //Estimate Ls=12, b+c=2 mres~0.0003 + + const int Ls = 12; + Real beta = 1.946; + Real light_mass = 0.00074; //0.00104 - mres_approx; + Real strange_mass = 0.02775; //0.02805 - mres_approx + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD mobius_scale = 2.; //b+c + + RealD mob_bmc = 1.0; + RealD mob_b = (mobius_scale + mob_bmc)/2.; + RealD mob_c = (mobius_scale - mob_bmc)/2.; + + //Setup the Grids + auto UGridD = TheHMC.Resources.GetCartesian(); + auto UrbGridD = TheHMC.Resources.GetRBCartesian(); + auto FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + auto FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + ConjugateIwasakiGaugeActionD GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD Ud(UGridD); + LatticeGaugeFieldF Uf(UGridF); + + //Setup the BCs + FermionActionD::ImplParams Params; + for(int i=0;i dirs4(Nd); + for(int i=0;i Level1(1); //light quark + strange quark + ActionLevel Level2(4); //DSDR + ActionLevel Level3(2); //gauge + + + ///////////////////////////////////////////////////////////// + // Light EOFA action + // have to be careful with the parameters, cf. 
Test_dwf_gpforce_eofa.cc + ///////////////////////////////////////////////////////////// + typedef SchurDiagMooeeOperator EOFAschuropD; + typedef SchurDiagMooeeOperator EOFAschuropF; + typedef ExactOneFlavourRatioMixedPrecHeatbathPseudoFermionAction EOFAmixPrecPFaction; + typedef MixedPrecisionConjugateGradientOperatorFunction EOFA_mxCG; + typedef MixedPrecisionReliableUpdateConjugateGradientOperatorFunction EOFA_relupCG; + + std::vector eofa_light_masses = { light_mass , 0.004, 0.016, 0.064, 0.256 }; + std::vector eofa_pv_masses = { 0.004 , 0.016, 0.064, 0.256, 1.0 }; + int n_light_hsb = 5; + assert(user_params.eofa_l.size() == n_light_hsb); + + EOFAmixPrecPFaction* EOFA_pfactions[n_light_hsb]; + + for(int i=0;iInnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance; + + EOFA_mxCG* ActionMCG_R = new EOFA_mxCG(user_params.eofa_l[i].action_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D); + ActionMCG_R->InnerTolerance = user_params.eofa_l[i].action_mixcg_inner_tolerance; + + EOFA_mxCG* DerivMCG_L = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *LopF, *LopD, *linopL_F, *linopL_D); + DerivMCG_L->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance; + + EOFA_mxCG* DerivMCG_R = new EOFA_mxCG(user_params.eofa_l[i].md_tolerance, 10000, 1000, UGridF, FrbGridF, *RopF, *RopD, *linopR_F, *linopR_D); + DerivMCG_R->InnerTolerance = user_params.eofa_l[i].md_mixcg_inner_tolerance; + + std::cout << GridLogMessage << "Set EOFA action solver action tolerance outer=" << ActionMCG_L->Tolerance << " inner=" << ActionMCG_L->InnerTolerance << std::endl; + std::cout << GridLogMessage << "Set EOFA MD solver tolerance outer=" << DerivMCG_L->Tolerance << " inner=" << DerivMCG_L->InnerTolerance << std::endl; +#endif + + + EOFAmixPrecPFaction* EOFA = new EOFAmixPrecPFaction(*LopF, *RopF, + *LopD, *RopD, + *ActionMCG_L, *ActionMCG_R, + *ActionMCG_L, *ActionMCG_R, + *DerivMCG_L, *DerivMCG_R, + 
user_params.eofa_l[i].rat_params, true); + EOFA_pfactions[i] = EOFA; + Level1.push_back(EOFA); + } + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionActionD Numerator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD,strange_mass,M5,mob_b,mob_c,Params); + FermionActionD Denominator_sD(Ud,*FGridD,*FrbGridD,*UGridD,*UrbGridD, pv_mass,M5,mob_b,mob_c,Params); + + FermionActionF Numerator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF,strange_mass,M5,mob_b,mob_c,Params); + FermionActionF Denominator_sF(Uf,*FGridF,*FrbGridF,*UGridF,*UrbGridF, pv_mass,M5,mob_b,mob_c,Params); + + RationalActionParams rat_act_params_s; + rat_act_params_s.inv_pow = 4; // (M^dag M)^{1/4} + rat_act_params_s.precision= 60; + rat_act_params_s.MaxIter = 10000; + user_params.rat_quo_s.Export(rat_act_params_s); + std::cout << GridLogMessage << " Heavy quark bounds check every " << rat_act_params_s.BoundsCheckFreq << " trajectories (avg)" << std::endl; + + //MixedPrecRHMC Quotient_s(Denominator_sD, Numerator_sD, Denominator_sF, Numerator_sF, rat_act_params_s, user_params.rat_quo_s.reliable_update_freq); + DoublePrecRHMC Quotient_s(Denominator_sD, Numerator_sD, rat_act_params_s); + Level1.push_back(&Quotient_s); + + /////////////////////////////////// + // DSDR action + /////////////////////////////////// + RealD dsdr_mass=-1.8; + //Use same DSDR twists as https://arxiv.org/pdf/1208.4412.pdf + RealD dsdr_epsilon_f = 0.02; //numerator (in determinant) + RealD dsdr_epsilon_b = 0.5; + GparityWilsonTMFermionD Numerator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_f, Params); + GparityWilsonTMFermionF Numerator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_f, Params); + + GparityWilsonTMFermionD Denominator_DSDR_D(Ud, *UGridD, *UrbGridD, dsdr_mass, dsdr_epsilon_b, Params); + GparityWilsonTMFermionF Denominator_DSDR_F(Uf, *UGridF, *UrbGridF, dsdr_mass, dsdr_epsilon_b, Params); + + RationalActionParams rat_act_params_DSDR; + 
rat_act_params_DSDR.inv_pow = 2; // (M^dag M)^{1/2} + rat_act_params_DSDR.precision= 60; + rat_act_params_DSDR.MaxIter = 10000; + user_params.rat_quo_DSDR.Export(rat_act_params_DSDR); + std::cout << GridLogMessage << "DSDR quark bounds check every " << rat_act_params_DSDR.BoundsCheckFreq << " trajectories (avg)" << std::endl; + + DoublePrecRHMC Quotient_DSDR(Denominator_DSDR_D, Numerator_DSDR_D, rat_act_params_DSDR); + Level2.push_back(&Quotient_DSDR); + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + Level3.push_back(&GaugeAction); + + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + TheHMC.TheAction.push_back(Level3); + std::cout << GridLogMessage << " Action complete "<< std::endl; + + + //Action tuning + bool + tune_rhmc_s=false, eigenrange_s=false, + tune_rhmc_DSDR=false, eigenrange_DSDR=false, + check_eofa=false, + upper_bound_eofa=false, lower_bound_eofa(false); + + std::string lanc_params_s; + std::string lanc_params_DSDR; + int tune_rhmc_s_action_or_md; + int tune_rhmc_DSDR_action_or_md; + int eofa_which_hsb; + + for(int i=1;i= 0 && eofa_which_hsb < n_light_hsb) ); + } + else if(sarg == "--upper_bound_eofa"){ + assert(i < argc-1); + upper_bound_eofa = true; + eofa_which_hsb = std::stoi(argv[i+1]); + assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb); + } + else if(sarg == "--lower_bound_eofa"){ + assert(i < argc-1); + lower_bound_eofa = true; + eofa_which_hsb = std::stoi(argv[i+1]); + assert(eofa_which_hsb >= 0 && eofa_which_hsb < n_light_hsb); + } + } + if(tune_rhmc_s || eigenrange_s || tune_rhmc_DSDR || eigenrange_DSDR ||check_eofa || upper_bound_eofa || lower_bound_eofa) { + std::cout << GridLogMessage << "Running checks" << std::endl; + TheHMC.initializeGaugeFieldAndRNGs(Ud); + + //std::cout << GridLogMessage << "EOFA action solver action tolerance outer=" << ActionMCG_L.Tolerance << " inner=" << 
ActionMCG_L.InnerTolerance << std::endl; + //std::cout << GridLogMessage << "EOFA MD solver tolerance outer=" << DerivMCG_L.Tolerance << " inner=" << DerivMCG_L.InnerTolerance << std::endl; + + + if(check_eofa){ + if(eofa_which_hsb >= 0){ + std::cout << GridLogMessage << "Starting checking EOFA Hasenbusch " << eofa_which_hsb << std::endl; + checkEOFA(*EOFA_pfactions[eofa_which_hsb], FGridD, TheHMC.Resources.GetParallelRNG(), Ud); + std::cout << GridLogMessage << "Finished checking EOFA Hasenbusch " << eofa_which_hsb << std::endl; + }else{ + for(int i=0;i(lanc_params_s, FGridD, FrbGridD, Ud, Numerator_sD, TheHMC.Resources.GetParallelRNG()); + if(tune_rhmc_s) checkRHMC(FGridD, FrbGridD, Ud, Numerator_sD, Denominator_sD, Quotient_s, TheHMC.Resources.GetParallelRNG(), 4, "strange", tune_rhmc_s_action_or_md); + if(eigenrange_DSDR) computeEigenvalues(lanc_params_DSDR, UGridD, UrbGridD, Ud, Numerator_DSDR_D, TheHMC.Resources.GetParallelRNG()); + if(tune_rhmc_DSDR) checkRHMC(UGridD, UrbGridD, Ud, Numerator_DSDR_D, Denominator_DSDR_D, Quotient_DSDR, TheHMC.Resources.GetParallelRNG(), 2, "DSDR", tune_rhmc_DSDR_action_or_md); + + + std::cout << GridLogMessage << " Done" << std::endl; + Grid_finalize(); + return 0; + } + + + //Run the HMC + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.Run(); + + std::cout << GridLogMessage << " Done" << std::endl; + Grid_finalize(); + return 0; +} // main diff --git a/tests/IO/Test_field_array_io.cc b/tests/IO/Test_field_array_io.cc new file mode 100644 index 00000000..51ea7893 --- /dev/null +++ b/tests/IO/Test_field_array_io.cc @@ -0,0 +1,184 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/IO/Test_field_array_io.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU 
General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//This test demonstrates and checks a single-file write of an arbitrary array of fields + +uint64_t writeHeader(const uint32_t size, const uint32_t checksum, const std::string &format, const std::string &file){ + std::ofstream fout(file,std::ios::out|std::ios::in); + fout.seekp(0,std::ios::beg); + fout << std::setw(10) << size << std::endl; + fout << std::hex << std::setw(10) << checksum << std::endl; + fout << format << std::endl; + return fout.tellp(); +} + +uint64_t readHeader(uint32_t &size, uint32_t &checksum, std::string &format, const std::string &file){ + std::ifstream fin(file); + std::string line; + getline(fin,line); + { + std::stringstream ss; ss <> size; + } + getline(fin,line); + { + std::stringstream ss; ss <> std::hex >> checksum; + } + getline(fin,format); + removeWhitespace(format); + + return fin.tellg(); +} + +template +void writeFieldArray(const std::string &file, const std::vector &data){ + typedef typename FieldType::vector_object vobj; + typedef typename FieldType::scalar_object sobj; + GridBase* grid = data[0].Grid(); //assume all fields have the same Grid + BinarySimpleMunger munge; //straight copy + + 
//We need a 2-pass header write, first to establish the size, the second pass writes the checksum + std::string format = getFormatString(); + + uint64_t offset; //leave 64 bits for header + if ( grid->IsBoss() ) { + NerscIO::truncate(file); + offset = writeHeader(data.size(), 0, format, file); + } + grid->Broadcast(0,(void *)&offset,sizeof(offset)); //use as a barrier + + std::cout << "Data offset write " << offset << std::endl; + std::cout << "Data size write " << data.size() << std::endl; + uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj); + std::cout << "Field size = " << field_size << " B" << std::endl; + + uint32_t checksum = 0; + for(int i=0;i(const_cast(data[i]),file,munge,offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + offset += field_size; + checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2); + } + std::cout << "Write checksum " << checksum << std::endl; + + if ( grid->IsBoss() ) { + writeHeader(data.size(), checksum, format, file); + } +} + + +template +void readFieldArray(std::vector &data, const std::string &file){ + typedef typename FieldType::vector_object vobj; + typedef typename FieldType::scalar_object sobj; + assert(data.size() > 0); + GridBase* grid = data[0].Grid(); //assume all fields have the same Grid + BinarySimpleUnmunger munge; //straight copy + + uint32_t hdr_checksum, hdr_size; + std::string format; + uint64_t offset = readHeader(hdr_size, hdr_checksum, format, file); + + std::cout << "Data offset read " << offset << std::endl; + std::cout << "Data size read " << hdr_size << std::endl; + assert(data.size() == hdr_size); + + uint64_t field_size = uint64_t(grid->gSites()) * sizeof(sobj); + + uint32_t checksum = 0; + + for(int i=0;i(data[i],file,munge,offset,format, + nersc_csum,scidac_csuma,scidac_csumb); + offset += field_size; + checksum ^= nersc_csum + 0x9e3779b9 + (checksum<<6) + (checksum>>2); + } + + std::cout << "Header checksum " << hdr_checksum << std::endl; + std::cout << "Read checksum " 
<< checksum << std::endl; + + + assert( hdr_checksum == checksum ); +} + + + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + const int Ls=8; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt, simd_layout, mpi_layout); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + typedef DomainWallFermionD::FermionField FermionField; + + int nfield = 20; + std::vector data(nfield, FGrid); + + for(int i=0;i data_r(nfield, FGrid); + readFieldArray(data_r, file); + + for(int i=0;i +Author: Leans heavily on Christoph Lehner's code +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +/* + * Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features + * in Grid that were intended to be used to support blocked Aggregates, from + */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +//For the CPS configurations we have to manually seed the RNG and deal with an incorrect factor of 2 in the plaquette metadata +void readConfiguration(LatticeGaugeFieldD &U, + const std::string &config, + bool is_cps_cfg = false){ + + if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = false; + + typedef GaugeStatistics GaugeStats; + + FieldMetaData header; + NerscIO::readConfiguration(U, header, config); + + if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = true; +} + +//Lanczos parameters in CPS conventions +struct CPSLanczosParams : Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams, + RealD, alpha, + RealD, beta, + int, ch_ord, + int, N_use, + int, N_get, + int, N_true_get, + RealD, stop_rsd, + int, maxits); + + //Translations + ChebyParams getChebyParams() const{ + ChebyParams out; + out.alpha = beta*beta; //aka lo + out.beta = alpha*alpha; //aka hi + out.Npoly = ch_ord+1; + return out; + } + int Nstop() const{ return N_true_get; } + int Nm() const{ return N_use; } + int Nk() const{ return N_get; } +}; + +//Maybe this class should be in the main library? 
+template +class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos +{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseField; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid, + LinearOperatorBase &FineOp, + int checkerboard) + // Base constructor + : LocalCoherenceLanczos(FineGrid,CoarseGrid,FineOp,checkerboard) + {}; + + void checkpointFine(std::string evecs_file,std::string evals_file) + { + assert(this->subspace.size()==nbasis); + emptyUserRecord record; + Grid::ScidacWriter WR(this->_FineGrid->IsBoss()); + WR.open(evecs_file); + for(int k=0;ksubspace[k],record); + } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_fine); + } + + void checkpointFineRestore(std::string evecs_file,std::string evals_file) + { + this->evals_fine.resize(nbasis); + this->subspace.resize(nbasis,this->_FineGrid); + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<evals_fine); + + if(this->evals_fine.size() < nbasis) assert(0 && "Not enough fine evals to complete basis"); + if(this->evals_fine.size() > nbasis){ //allow the use of precomputed evecs with a larger #evecs + std::cout << GridLogMessage << "Truncating " << this->evals_fine.size() << " evals to basis size " << nbasis << std::endl; + this->evals_fine.resize(nbasis); + } + + std::cout << GridLogIRL<< "checkpointFineRestore: Reading evecs from "<subspace[k].Checkerboard()=this->_checkerboard; + RD.readScidacFieldRecord(this->subspace[k],record); + + } + RD.close(); + } + + void checkpointCoarse(std::string evecs_file,std::string evals_file) + { + int n = this->evec_coarse.size(); + emptyUserRecord record; + Grid::ScidacWriter WR(this->_CoarseGrid->IsBoss()); + WR.open(evecs_file); + for(int k=0;kevec_coarse[k],record); + } + WR.close(); + + XmlWriter WRx(evals_file); + write(WRx,"evals",this->evals_coarse); + } + + void 
checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec) + { + std::cout << "resizing coarse vecs to " << nvec<< std::endl; + this->evals_coarse.resize(nvec); + this->evec_coarse.resize(nvec,this->_CoarseGrid); + std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evals from "<evals_coarse); + + assert(this->evals_coarse.size()==nvec); + emptyUserRecord record; + std::cout << GridLogIRL<< "checkpointCoarseRestore: Reading evecs from "<evec_coarse[k],record); + } + RD.close(); + } +}; + +struct Options{ + std::vector blockSize; + std::vector GparityDirs; + int Ls; + RealD mass; + RealD M5; + RealD mobius_scale; + std::string config; + bool is_cps_cfg; + + double coarse_relax_tol; + int smoother_ord; + + CPSLanczosParams fine; + CPSLanczosParams coarse; + + bool write_fine = false; + std::string write_fine_file; + + bool read_fine = false; + std::string read_fine_file; + + bool write_coarse = false; + std::string write_coarse_file; + + bool read_coarse = false; + std::string read_coarse_file; + + + Options(){ + blockSize = std::vector ({2,2,2,2,2}); + GparityDirs = std::vector ({1,1,1}); //1 for each GP direction + + Ls = 12; + mass = 0.01; + M5 = 1.8; + is_cps_cfg = false; + mobius_scale = 2.0; + + fine.alpha = 2; + fine.beta = 0.1; + fine.ch_ord = 100; + fine.N_use = 70; + fine.N_get = 60; + fine.N_true_get = 60; + fine.stop_rsd = 1e-8; + fine.maxits = 10000; + + coarse.alpha = 2; + coarse.beta = 0.1; + coarse.ch_ord = 100; + coarse.N_use = 200; + coarse.N_get = 190; + coarse.N_true_get = 190; + coarse.stop_rsd = 1e-8; + coarse.maxits = 10000; + + coarse_relax_tol = 1e5; + smoother_ord = 20; + + write_fine = false; + read_fine = false; + write_coarse = false; + read_coarse = false; + } +}; + +template +void runTest(const Options &opt){ + //Fine grids + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = 
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(opt.Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(opt.Ls,UGrid); + + //Setup G-parity BCs + assert(Nd == 4); + std::vector dirs4(4); + for(int i=0;i<3;i++) dirs4[i] = opt.GparityDirs[i]; + dirs4[3] = 0; //periodic gauge BC in time + + std::cout << GridLogMessage << "Gauge BCs: " << dirs4 << std::endl; + ConjugateGimplD::setDirections(dirs4); //gauge BC + + GparityWilsonImplD::ImplParams Params; + for(int i=0;i SchurOp(action); + + typedef GparityWilsonImplD::SiteSpinor SiteSpinor; + + const CPSLanczosParams &fine = opt.fine; + const CPSLanczosParams &coarse = opt.coarse; + + std::cout << GridLogMessage << "Keep " << fine.N_true_get << " fine vectors" << std::endl; + std::cout << GridLogMessage << "Keep " << coarse.N_true_get << " coarse vectors" << std::endl; + assert(coarse.N_true_get >= fine.N_true_get); + + assert(nbasis<=fine.N_true_get); + LocalCoherenceLanczosScidac _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,SchurOp,Odd); + std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl; + + //Compute and/or read fine evecs + if(opt.read_fine){ + _LocalCoherenceLanczos.checkpointFineRestore(opt.read_fine_file + "_evecs.scidac", opt.read_fine_file + "_evals.xml"); + }else{ + std::cout << GridLogMessage << "Performing fine grid IRL" << std::endl; + std::cout << GridLogMessage << "Using Chebyshev alpha=" << fine.alpha << " beta=" << fine.beta << " ord=" << fine.ch_ord << std::endl; + _LocalCoherenceLanczos.calcFine(fine.getChebyParams(), + fine.Nstop(),fine.Nk(),fine.Nm(), + fine.stop_rsd,fine.maxits,0,0); + if(opt.write_fine){ + std::cout << GridLogIRL<<"Checkpointing Fine evecs"< cheb_smoother(smoother); + + FermionField evec(FrbGrid); + FermionField evec_sm(FrbGrid); //smoothed + FermionField tmp(FrbGrid); + RealD eval; + + for(int i=0;i " << std::endl; + std::cout << GridLogMessage << " should have 
the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl; + std::cout << GridLogMessage << "Options:" << std::endl; + std::cout << GridLogMessage << "--Ls : Set Ls (default 12)" << std::endl; + std::cout << GridLogMessage << "--mass : Set the mass (default 0.01)" << std::endl; + std::cout << GridLogMessage << "--block : Set the block size. Format should be a.b.c.d.e where a-e are the block extents (default 2.2.2.2.2)" << std::endl; + std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl; + std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl; + std::cout << GridLogMessage << "--read_irl_fine : Real the parameters file for the fine Lanczos" << std::endl; + std::cout << GridLogMessage << "--read_irl_coarse : Real the parameters file for the coarse Lanczos" << std::endl; + std::cout << GridLogMessage << "--write_fine : Write fine evecs/evals to filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--read_fine : Read fine evecs/evals from filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--write_coarse : Write coarse evecs/evals to filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--read_coarse : Read coarse evecs/evals from filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--smoother_ord : Set the Chebyshev order of the smoother (default 20)" << std::endl; + std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl; + std::cout << GridLogMessage << "--basis_size : Select the basis size from 100,200,300,350 (default 100)" << std::endl; + 
Grid_finalize(); + return 1; + } + opt.config = argv[1]; + GridCmdOptionIntVector(argv[2], opt.GparityDirs); + assert(opt.GparityDirs.size() == 3); + + for(int i=3;i> opt.mass; + std::cout << GridLogMessage << "Set quark mass to " << opt.mass << std::endl; + }else if(sarg == "--block"){ + GridCmdOptionIntVector(argv[i+1], opt.blockSize); + assert(opt.blockSize.size() == 5); + std::cout << GridLogMessage << "Set block size to "; + for(int q=0;q<5;q++) std::cout << opt.blockSize[q] << " "; + std::cout << std::endl; + }else if(sarg == "--is_cps_cfg"){ + opt.is_cps_cfg = true; + }else if(sarg == "--write_irl_templ"){ + XmlWriter writer("irl_templ.xml"); + write(writer,"Params", opt.fine); + Grid_finalize(); + return 0; + }else if(sarg == "--read_irl_fine"){ + std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl; + XmlReader reader(argv[i+1]); + read(reader, "Params", opt.fine); + }else if(sarg == "--read_irl_coarse"){ + std::cout << GridLogMessage << "Reading coarse IRL params from " << argv[i+1] << std::endl; + XmlReader reader(argv[i+1]); + read(reader, "Params", opt.coarse); + }else if(sarg == "--write_fine"){ + opt.write_fine = true; + opt.write_fine_file = argv[i+1]; + }else if(sarg == "--read_fine"){ + opt.read_fine = true; + opt.read_fine_file = argv[i+1]; + }else if(sarg == "--write_coarse"){ + opt.write_coarse = true; + opt.write_coarse_file = argv[i+1]; + }else if(sarg == "--read_coarse"){ + opt.read_coarse = true; + opt.read_coarse_file = argv[i+1]; + }else if(sarg == "--smoother_ord"){ + std::istringstream ss(argv[i+1]); ss >> opt.smoother_ord; + std::cout << GridLogMessage << "Set smoother order to " << opt.smoother_ord << std::endl; + }else if(sarg == "--coarse_relax_tol"){ + std::istringstream ss(argv[i+1]); ss >> opt.coarse_relax_tol; + std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << opt.coarse_relax_tol << std::endl; + }else if(sarg == "--mobius_scale"){ + std::istringstream ss(argv[i+1]); 
ss >> opt.mobius_scale; + std::cout << GridLogMessage << "Set Mobius scale to " << opt.mobius_scale << std::endl; + }else if(sarg == "--basis_size"){ + basis_size = std::stoi(argv[i+1]); + std::cout << GridLogMessage << "Set basis size to " << basis_size << std::endl; + } + } + + switch(basis_size){ + case 100: + runTest<100>(opt); break; + case 200: + runTest<200>(opt); break; + case 300: + runTest<300>(opt); break; + case 350: + runTest<350>(opt); break; + default: + std::cout << GridLogMessage << "Unsupported basis size " << basis_size << std::endl; + assert(0); + } + + Grid_finalize(); +} + diff --git a/tests/lanczos/Test_evec_compression.cc b/tests/lanczos/Test_evec_compression.cc new file mode 100644 index 00000000..1636ea3a --- /dev/null +++ b/tests/lanczos/Test_evec_compression.cc @@ -0,0 +1,582 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_evec_compression.cc + + Copyright (C) 2017 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +/* + * + * This test generates eigenvectors using the Lanczos algorithm then attempts to use local coherence compression + * to express those vectors in terms of a basis formed from a subset. This test is useful for finding the optimal + * blocking and basis size for performing a Local Coherence Lanczos + */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +//For the CPS configurations we have to manually seed the RNG and deal with an incorrect factor of 2 in the plaquette metadata +template +void readConfiguration(LatticeGaugeFieldD &U, + const std::string &config, + bool is_cps_cfg = false){ + + if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = false; + + typedef GaugeStatistics GaugeStats; + + FieldMetaData header; + NerscIO::readConfiguration(U, header, config); + + if(is_cps_cfg) NerscIO::exitOnReadPlaquetteMismatch() = true; +} + +//Lanczos parameters in CPS conventions +struct CPSLanczosParams : Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(CPSLanczosParams, + RealD, alpha, + RealD, beta, + int, ch_ord, + int, N_use, + int, N_get, + int, N_true_get, + RealD, stop_rsd, + int, maxits); + + //Translations + ChebyParams getChebyParams() const{ + ChebyParams out; + out.alpha = beta*beta; //aka lo + out.beta = alpha*alpha; //aka hi + out.Npoly = ch_ord+1; + return out; + } + int Nstop() const{ return N_true_get; } + int Nm() const{ return N_use; } + int Nk() const{ return N_get; } +}; + + +template +class LocalCoherenceCompressor{ +public: + typedef iVector CoarseSiteVector; + typedef Lattice CoarseScalar; // used for inner products on fine field + typedef Lattice CoarseField; + typedef Lattice FineField; + + void compress(std::vector &basis, + std::vector &compressed_evecs, + const std::vector &evecs_in, + GridBase 
*FineGrid, + GridBase *CoarseGrid){ + int nevecs = evecs_in.size(); + assert(nevecs > nbasis); + + //Construct the basis + basis.resize(nbasis, FineGrid); + for(int b=0;b &basis, const std::vector &compressed_evecs) const{ + blockPromote(compressed_evecs[i],evec,basis); + } + + //Test uncompressed eigenvectors of Linop.HermOp to precision 'base_tolerance' for i=nbasis + //Because the uncompressed evec has a lot of high mode noise (unimportant for deflation) we apply a smoother before testing. + //The Chebyshev used by the Lanczos should be sufficient as a smoother + bool testCompression(LinearOperatorBase &Linop, OperatorFunction &smoother, + const std::vector &basis, const std::vector &compressed_evecs, const std::vector &evals, + const RealD base_tolerance, const RealD relax){ + std::cout << GridLogMessage << "Testing quality of uncompressed evecs (after smoothing)" << std::endl; + + GridBase* FineGrid = basis[0].Grid(); + GridBase* CoarseGrid = compressed_evecs[0].Grid(); + + bool fail = false; + FineField evec(FineGrid), Mevec(FineGrid), evec_sm(FineGrid); + for(int i=0;i tol) fail = true; + } + return fail; + } + + //Compare uncompressed evecs to original evecs + void compareEvecs(const std::vector &basis, const std::vector &compressed_evecs, const std::vector &orig_evecs){ + std::cout << GridLogMessage << "Comparing uncompressed evecs to original evecs" << std::endl; + + GridBase* FineGrid = basis[0].Grid(); + GridBase* CoarseGrid = compressed_evecs[0].Grid(); + + FineField evec(FineGrid), diff(FineGrid); + for(int i=0;i +void compareBlockPromoteTimings(const std::vector > &basis, const std::vector > > &compressed_evecs){ + typedef iVector CoarseSiteVector; + typedef Lattice CoarseScalar; + typedef Lattice CoarseField; + typedef Lattice FineField; + + GridStopWatch timer; + + GridBase* FineGrid = basis[0].Grid(); + GridBase* CoarseGrid = compressed_evecs[0].Grid(); + + FineField v1(FineGrid), v2(FineGrid); + + //Start with a cold start + for(int i=0;i 
blockSize; + std::vector GparityDirs; + + bool write_fine; + std::string write_fine_file; + bool read_fine; + std::string read_fine_file; + + int basis_size; + + Args(){ + blockSize = {2,2,2,2,2}; + GparityDirs = {1,1,1}; //1 for each GP direction + + Ls = 12; + mass = 0.01; + M5 = 1.8; + is_cps_cfg = false; + mobius_scale = 2; + + fine.alpha = 2; + fine.beta = 0.1; + fine.ch_ord = 100; + fine.N_use = 70; + fine.N_get = 60; + fine.N_true_get = 60; + fine.stop_rsd = 1e-8; + fine.maxits = 10000; + + coarse_relax_tol = 1e5; + + write_fine = false; + read_fine = false; + + basis_size = 100; + } +}; + + +GparityWilsonImplD::ImplParams setupGparityParams(const std::vector &GparityDirs){ + //Setup G-parity BCs + assert(Nd == 4); + std::vector dirs4(4); + for(int i=0;i<3;i++) dirs4[i] = GparityDirs[i]; + dirs4[3] = 0; //periodic gauge BC in time + + std::cout << GridLogMessage << "Gauge BCs: " << dirs4 << std::endl; + ConjugateGimplD::setDirections(dirs4); //gauge BC + + GparityWilsonImplD::ImplParams Params; + for(int i=0;i +void run_b(ActionType &action, const std::string &config, const Args &args){ + //Fine grids + GridCartesian * UGrid = (GridCartesian*)action.GaugeGrid(); + GridRedBlackCartesian * UrbGrid = (GridRedBlackCartesian*)action.GaugeRedBlackGrid(); + GridCartesian * FGrid = (GridCartesian*)action.FermionGrid(); + GridRedBlackCartesian * FrbGrid = (GridRedBlackCartesian*)action.FermionRedBlackGrid(); + + //Setup the coarse grids + auto fineLatt = GridDefaultLatt(); + Coordinate coarseLatt(4); + for (int d=0;d<4;d++){ + coarseLatt[d] = fineLatt[d]/args.blockSize[d]; assert(coarseLatt[d]*args.blockSize[d]==fineLatt[d]); + } + + std::cout << GridLogMessage<< " 5d coarse lattice is "; + for (int i=0;i<4;i++){ + std::cout << coarseLatt[i]<<"x"; + } + int cLs = args.Ls/args.blockSize[4]; assert(cLs*args.blockSize[4]==args.Ls); + std::cout << cLs< CoarseSiteVector; + typedef Lattice CoarseScalar; + typedef Lattice CoarseField; + + typedef typename 
ActionType::FermionField FermionField; + + SchurDiagTwoOperator SchurOp(action); + + typedef typename ActionType::SiteSpinor SiteSpinor; + + const CPSLanczosParams &fine = args.fine; + + //Do the fine Lanczos + std::vector evals; + std::vector evecs; + + if(args.read_fine){ + evals.resize(fine.N_true_get); + evecs.resize(fine.N_true_get, FrbGrid); + + std::string evals_file = args.read_fine_file + "_evals.xml"; + std::string evecs_file = args.read_fine_file + "_evecs.scidac"; + + std::cout << GridLogIRL<< "Reading evals from "< Cheby(fine.getChebyParams()); + FunctionHermOp ChebyOp(Cheby,SchurOp); + PlainHermOp Op(SchurOp); + + evals.resize(Nm); + evecs.resize(Nm,FrbGrid); + + ImplicitlyRestartedLanczos IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,0,0); + + FermionField src(FrbGrid); + typedef typename FermionField::scalar_type Scalar; + src=Scalar(1.0); + src.Checkerboard() = Odd; + + int Nconv; + IRL.calc(evals, evecs,src,Nconv,false); + if(Nconv < Nstop) assert(0 && "Fine lanczos failed to converge the required number of evecs"); //algorithm doesn't consider this a failure + if(Nconv > Nstop){ + //Yes this potentially throws away some evecs but it is better than having a random number of evecs between Nstop and Nm! 
+ evals.resize(Nstop); + evecs.resize(Nstop, FrbGrid); + } + + if(args.write_fine){ + std::string evals_file = args.write_fine_file + "_evals.xml"; + std::string evecs_file = args.write_fine_file + "_evecs.scidac"; + + std::cout << GridLogIRL<< "Writing evecs to "<IsBoss()); + WR.open(evecs_file); + for(int k=0;k compressor; + std::vector basis(nbasis,FrbGrid); + std::vector compressed_evecs(evecs.size(),CoarseGrid5); + + compressor.compress(basis, compressed_evecs, evecs, FrbGrid, CoarseGrid5); + + compareBlockPromoteTimings(basis, compressed_evecs); + + //Compare uncompressed and original evecs + compressor.compareEvecs(basis, compressed_evecs, evecs); + + //Create the smoother + Chebyshev smoother(fine.getChebyParams()); + + //Test the quality of the uncompressed evecs + assert( compressor.testCompression(SchurOp, smoother, basis, compressed_evecs, evals, fine.stop_rsd, args.coarse_relax_tol) ); +} + +template +void run(ActionType &action, const std::string &config, const Args &args){ + switch(args.basis_size){ + case 50: + return run_b<50>(action,config,args); + case 100: + return run_b<100>(action,config,args); + case 150: + return run_b<150>(action,config,args); + case 200: + return run_b<200>(action,config,args); + case 250: + return run_b<250>(action,config,args); + case 300: + return run_b<300>(action,config,args); + case 350: + return run_b<350>(action,config,args); + case 400: + return run_b<400>(action,config,args); + default: + assert(0 && "Unsupported basis size: allowed values are 50,100,200,250,300,350,400"); + } +} + + + + +//Note: because we rely upon physical properties we must use a "real" gauge configuration +int main (int argc, char ** argv) { + Grid_init(&argc,&argv); + GridLogIRL.TimingMode(1); + + if(argc < 3){ + std::cout << GridLogMessage << "Usage: " << std::endl; + std::cout << GridLogMessage << " should have the format a.b.c where a,b,c are 0,1 depending on whether there are G-parity BCs in that direction" << std::endl; + std::cout << 
GridLogMessage << "Options:" << std::endl; + std::cout << GridLogMessage << "--Ls : Set Ls (default 12)" << std::endl; + std::cout << GridLogMessage << "--mass : Set the mass (default 0.01)" << std::endl; + std::cout << GridLogMessage << "--block : Set the block size. Format should be a.b.c.d.e where a-e are the block extents (default 2.2.2.2.2)" << std::endl; + std::cout << GridLogMessage << "--is_cps_cfg : Indicate that the configuration was generated with CPS where until recently the stored plaquette was wrong by a factor of 2" << std::endl; + std::cout << GridLogMessage << "--write_irl_templ: Write a template for the parameters file of the Lanczos to \"irl_templ.xml\"" << std::endl; + std::cout << GridLogMessage << "--read_irl_fine : Real the parameters file for the fine Lanczos" << std::endl; + std::cout << GridLogMessage << "--write_fine : Write fine evecs/evals to filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--read_fine : Read fine evecs/evals from filename starting with the stub" << std::endl; + std::cout << GridLogMessage << "--coarse_relax_tol : Set the relaxation parameter for evaluating the residual of the reconstructed eigenvectors outside of the basis (default 1e5)" << std::endl; + std::cout << GridLogMessage << "--action : Set the action from 'DWF', 'Mobius' (default Mobius)" << std::endl; + std::cout << GridLogMessage << "--mobius_scale : Set the Mobius scale b+c (default 2)" << std::endl; + std::cout << GridLogMessage << "--basis_size : Set the basis size from 50,100,150,200,250,300,350,400 (default 100)" << std::endl; + + Grid_finalize(); + return 1; + } + std::string config = argv[1]; + + Args args; + GridCmdOptionIntVector(argv[2], args.GparityDirs); + assert(args.GparityDirs.size() == 3); + + std::string action_s = "Mobius"; + + for(int i=3;i> args.mass; + std::cout << GridLogMessage << "Set quark mass to " << args.mass << std::endl; + }else if(sarg == "--block"){ + GridCmdOptionIntVector(argv[i+1], 
args.blockSize); + assert(args.blockSize.size() == 5); + std::cout << GridLogMessage << "Set block size to "; + for(int q=0;q<5;q++) std::cout << args.blockSize[q] << " "; + std::cout << std::endl; + }else if(sarg == "--is_cps_cfg"){ + args.is_cps_cfg = true; + }else if(sarg == "--write_irl_templ"){ + XmlWriter writer("irl_templ.xml"); + write(writer,"Params",args.fine); + Grid_finalize(); + return 0; + }else if(sarg == "--read_irl_fine"){ + std::cout << GridLogMessage << "Reading fine IRL params from " << argv[i+1] << std::endl; + XmlReader reader(argv[i+1]); + read(reader, "Params", args.fine); + }else if(sarg == "--write_fine"){ + args.write_fine = true; + args.write_fine_file = argv[i+1]; + }else if(sarg == "--read_fine"){ + args.read_fine = true; + args.read_fine_file = argv[i+1]; + }else if(sarg == "--coarse_relax_tol"){ + std::istringstream ss(argv[i+1]); ss >> args.coarse_relax_tol; + std::cout << GridLogMessage << "Set coarse IRL relaxation parameter to " << args.coarse_relax_tol << std::endl; + }else if(sarg == "--action"){ + action_s = argv[i+1]; + std::cout << "Action set to " << action_s << std::endl; + }else if(sarg == "--mobius_scale"){ + std::istringstream ss(argv[i+1]); ss >> args.mobius_scale; + std::cout << GridLogMessage << "Set Mobius scale to " << args.mobius_scale << std::endl; + }else if(sarg == "--basis_size"){ + args.basis_size = std::stoi(argv[i+1]); + std::cout << GridLogMessage << "Set basis size to " << args.basis_size << std::endl; + } + } + + //Fine grids + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(args.Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(args.Ls,UGrid); + + LatticeGaugeField Umu(UGrid); + + bool is_gparity = false; + for(auto g : args.GparityDirs) if(g) 
is_gparity = true; + + double bmc = 1.; + double b = (args.mobius_scale + bmc)/2.; // b = 1/2 [ (b+c) + (b-c) ] + double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ] + + if(is_gparity){ + GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs); + readConfiguration(Umu, config, args.is_cps_cfg); //Read the gauge field + + if(action_s == "DWF"){ + GparityDomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params); + run(action, config, args); + }else if(action_s == "Mobius"){ + GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params); + run(action, config, args); + } + }else{ + WilsonImplD::ImplParams Params = setupParams(); + readConfiguration(Umu, config, args.is_cps_cfg); //Read the gauge field + + if(action_s == "DWF"){ + DomainWallFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, Params); + run(action, config, args); + }else if(action_s == "Mobius"){ + MobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params); + run(action, config, args); + } + } + + Grid_finalize(); +} From 3544965f5497a18c9e2c84f5ae9874e8b80adcce Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 7 Jul 2022 17:49:20 +0100 Subject: [PATCH 071/240] Stream doesn't work --- Grid/threads/Accelerator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index d88c08b4..3be3bbe7 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, #define accelerator_barrier(dummy) \ { \ - cudaStreamSynchronize(cpuStream); \ + cudaDeviceSynchronize(); \ cudaError err = cudaGetLastError(); \ if ( cudaSuccess != err ) { \ printf("accelerator_barrier(): Cuda error %s \n", \ @@ -362,11 +362,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { dim3 
hip_blocks ((num1+nt-1)/nt,num2,1); \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ - 0,cpuStream, \ + 0,0/*cpuStream*/, \ num1,num2,nsimd, lambda); \ } else { \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ - 0,cpuStream, \ + 0,0/*cpuStream*/, \ num1,num2,nsimd, lambda); \ } \ } From c0f84824025c17a0bb97ece42833f671e3ce6770 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 7 Jul 2022 17:49:36 +0100 Subject: [PATCH 072/240] Remove SSC marks --- benchmarks/Benchmark_gparity.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc index ce84ecbc..c10e7849 100644 --- a/benchmarks/Benchmark_gparity.cc +++ b/benchmarks/Benchmark_gparity.cc @@ -98,9 +98,7 @@ int main (int argc, char ** argv) std::cout<Barrier(); @@ -141,9 +139,7 @@ int main (int argc, char ** argv) std::cout<Barrier(); From b0fe664e9d74b634ca880c56095c8f90a34ca6aa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 Jul 2022 21:31:25 +0100 Subject: [PATCH 073/240] Better force log info --- Grid/qcd/action/ActionBase.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index fa69b4e5..b8c81f99 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -42,6 +42,8 @@ public: bool is_smeared = false; RealD deriv_norm_sum; RealD deriv_max_sum; + RealD Fdt_norm_sum; + RealD Fdt_max_sum; int deriv_num; RealD deriv_us; RealD S_us; @@ -51,12 +53,17 @@ public: deriv_num=0; deriv_norm_sum = deriv_max_sum=0.0; } - void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;} - RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; - RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; + void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) { + deriv_max_sum+=max; 
deriv_norm_sum+=nrm; + Fdt_max_sum+=Fdt_max; Fdt_norm_sum+=Fdt_nrm; deriv_num++; + } + RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; + RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; + RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; }; + RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; }; RealD deriv_timer(void) { return deriv_us; }; - RealD S_timer(void) { return deriv_us; }; - RealD refresh_timer(void) { return deriv_us; }; + RealD S_timer(void) { return S_us; }; + RealD refresh_timer(void) { return refresh_us; }; void deriv_timer_start(void) { deriv_us-=usecond(); } void deriv_timer_stop(void) { deriv_us+=usecond(); } void refresh_timer_start(void) { refresh_us-=usecond(); } From 1f907d330da8496d369877d7c93bd1379178acdc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 Jul 2022 21:31:48 +0100 Subject: [PATCH 074/240] Different default params for dirichlet --- Grid/qcd/action/ActionParams.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index b203f27a..274ff318 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -39,7 +39,7 @@ struct GparityWilsonImplParams { Coordinate twists; //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs Coordinate dirichlet; // Blocksize of dirichlet BCs - GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {}; + GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); }; }; struct WilsonImplParams { @@ -48,13 +48,13 @@ struct WilsonImplParams { AcceleratorVector twist_n_2pi_L; AcceleratorVector boundary_phases; WilsonImplParams() { - dirichlet.resize(Nd,0); + dirichlet.resize(0); boundary_phases.resize(Nd, 1.0); twist_n_2pi_L.resize(Nd, 0.0); }; WilsonImplParams(const AcceleratorVector phi) : boundary_phases(phi), overlapCommsCompute(false) { twist_n_2pi_L.resize(Nd, 0.0); - 
dirichlet.resize(Nd,0); + dirichlet.resize(0); } }; @@ -62,7 +62,7 @@ struct StaggeredImplParams { Coordinate dirichlet; // Blocksize of dirichlet BCs StaggeredImplParams() { - dirichlet.resize(Nd,0); + dirichlet.resize(0); }; }; From 58182fe3455a0d78de6074e6c5e38bdd88e67751 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 Jul 2022 21:32:58 +0100 Subject: [PATCH 075/240] Different approach to default dirichlet params --- Grid/stencil/Stencil.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index eb73ba5f..9d005289 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -711,7 +711,9 @@ public: this->_comms_recv.resize(npoints); this->same_node.resize(npoints); - if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up + if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); + + DirichletBlock(p.dirichlet); // comms send/recv set up _unified_buffer_size=0; surface_list.resize(0); From 177b1a7ec630bc02f661e2e10299b73b47e9281a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 Jul 2022 21:34:10 +0100 Subject: [PATCH 076/240] Mixed prec --- HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc | 13 +- HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc | 470 ++++++++++++++++++++++ 2 files changed, 480 insertions(+), 3 deletions(-) create mode 100644 HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc index a9b5dc7e..13bda00b 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc @@ -128,8 +128,14 @@ template MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else std::cout << GridLogMessage << "Calling mixed precision Conjugate 
Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif MPCG(src,psi); } }; @@ -161,7 +167,7 @@ int main(int argc, char **argv) { // MD.name = std::string("Force Gradient"); typedef GenericHMCRunner HMCWrapper; MD.name = std::string("MinimumNorm2"); - MD.MDsteps = 4; + MD.MDsteps = 6; MD.trajL = 1.0; HMCparameters HMCparams; @@ -204,7 +210,8 @@ int main(int argc, char **argv) { Real light_mass = 7.8e-4; Real strange_mass = 0.02132; Real pv_mass = 1.0; - std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // FIXME: // Same in MC and MD diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc new file mode 100644 index 00000000..869f41f8 --- /dev/null +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc @@ -0,0 +1,470 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif + MPCG(src,psi); + } + }; + +NAMESPACE_END(Grid); + + +int main(int argc, char **argv) { + using 
namespace Grid; + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef WilsonImplF FermionImplPolicyF; + + typedef MobiusFermionR FermionAction; + typedef MobiusFermionF FermionActionF; + typedef typename FermionAction::FermionField FermionField; + typedef typename FermionActionF::FermionField FermionFieldF; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 6; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + 
Real beta = 2.31; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.02132; + Real pv_mass = 1.0; + // std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + + // FIXME: + // Same in MC and MD + // Need to mix precision too + OneFlavourRationalParams SFRp; // Strange + SFRp.lo = 4.0e-3; + SFRp.hi = 90.0; + SFRp.MaxIter = 60000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 1.0e-6; + SFRp.degree = 12; + SFRp.precision= 50; + SFRp.BoundsCheckFreq=0; + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 2.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-8; + OFRp.mdtolerance= 1.0e-6; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeFieldF UF(GridPtrF); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + Params.dirichlet=NonDirichlet; + FermionAction::ImplParams ParamsDir(boundary); + ParamsDir.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-10; + double MDStoppingCondition = 1e-7; + double MDStoppingConditionLoose = 1e-6; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + 
//////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(4); + ActionLevel Level3(8); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir); + FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir); + + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); + OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); + Level1.push_back(&StrangePseudoFermionBdy); + Level2.push_back(&StrangePseudoFermionLocal); + Level1.push_back(&StrangePseudoFermionPVBdy); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector NumeratorsF; + std::vector Denominators; + std::vector DenominatorsF; + std::vector *> Quotients; + +#define MIXED_PRECISION +#ifdef MIXED_PRECISION + std::vector *> Bdys; +#else + std::vector *> Bdys; +#endif + std::vector ActionMPCG; + std::vector MPCG; + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int 
h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } else { +#ifdef MIXED_PRECISION + Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( + *Numerators[h],*Denominators[h], + *NumeratorsF[h],*DenominatorsF[h], + OFRp, 500) ); +#else + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); +#endif + } + } + + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + for(int h=0;h Date: Sun, 10 Jul 2022 21:35:18 +0100 Subject: [PATCH 077/240] MixedPrec support --- .../OneFlavourEvenOddRationalRatio.h | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index 1b36ae0f..078bf845 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -67,6 +67,36 @@ NAMESPACE_BEGIN(Grid); virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";} }; + template + class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction { + public: + typedef OneFlavourRationalParams Params; + private: + static RationalActionParams transcribe(const Params &in){ + RationalActionParams out; + out.inv_pow = 2; + out.lo = in.lo; + out.hi = in.hi; + out.MaxIter = in.MaxIter; + out.action_tolerance = out.md_tolerance = in.tolerance; + out.action_degree = out.md_degree = in.degree; + out.precision = in.precision; + out.BoundsCheckFreq = in.BoundsCheckFreq; + return out; + } + + public: + 
OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator &_NumOp, + FermionOperator &_DenOp, + FermionOperator &_NumOpF, + FermionOperator &_DenOpF, + const Params & p, Integer ReliableUpdateFreq + ) : + GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){} + + virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";} + }; + NAMESPACE_END(Grid); #endif From fab50c57d9d14be6cc0f32de93160c94eb563fec Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Jul 2022 18:42:27 +0100 Subject: [PATCH 078/240] More loggin --- Grid/qcd/hmc/integrators/Integrator.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 5bdef14d..33a77f32 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -153,7 +153,7 @@ protected: Real force_max = std::sqrt(maxLocalNorm2(force)); Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR; - as[level].actions.at(a)->deriv_log(force_abs,force_max); + as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); std::cout << GridLogIntegrator<< "["<deriv_max_average() <<" norm " << as[level].actions.at(actionID)->deriv_norm_average() + <<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average() + <<" norm " << as[level].actions.at(actionID)->Fdt_norm_average() <<" calls " << as[level].actions.at(actionID)->deriv_num << std::endl; } From f7217d12d2137c33c559ea33746ad9efab3b3bc3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Jul 2022 13:45:31 -0400 Subject: [PATCH 079/240] World barrier for clock synch --- Grid/communicator/Communicator_base.h | 1 + Grid/communicator/Communicator_mpi3.cc | 4 ++++ Grid/communicator/Communicator_none.cc | 1 + 3 files changed, 6 insertions(+) diff --git a/Grid/communicator/Communicator_base.h 
b/Grid/communicator/Communicator_base.h index d4f12f86..cb3b9f0e 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -107,6 +107,7 @@ public: //////////////////////////////////////////////////////////////////////////////// static int RankWorld(void) ; static void BroadcastWorld(int root,void* data, int bytes); + static void BarrierWorld(void); //////////////////////////////////////////////////////////// // Reduction diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 0d0a3443..b5f751b8 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -443,6 +443,10 @@ int CartesianCommunicator::RankWorld(void){ MPI_Comm_rank(communicator_world,&r); return r; } +void CartesianCommunicator::BarrierWorld(void){ + int ierr = MPI_Barrier(communicator_world); + assert(ierr==0); +} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { int ierr= MPI_Bcast(data, diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index a0f33ca4..4b533c4b 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -104,6 +104,7 @@ int CartesianCommunicator::RankWorld(void){return 0;} void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +void CartesianCommunicator::BarrierWorld(void) { } int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) From f73db8f1f3d44f2fde5bcb0eebcaccb708cdc485 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Jul 2022 13:47:39 -0400 Subject: [PATCH 080/240] Synch clocks --- 
Grid/util/Init.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 36854d9c..c3ac2424 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -356,6 +356,11 @@ void Grid_init(int *argc,char ***argv) ////////////////////////////////////////////////////////// CartesianCommunicator::Init(argc,argv); + GridLogger::GlobalStopWatch.Stop(); + CartesianCommunicator::BarrierWorld(); + GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock + GridLogger::GlobalStopWatch.Start(); + //////////////////////////////////// // Banner after MPI (unless GPU) //////////////////////////////////// From ca4603580d9ccc761c665192c0f3aaeb412ba9e5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Jul 2022 13:48:35 -0400 Subject: [PATCH 081/240] Verbose --- HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc index a9b5dc7e..c7a6ecd7 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc @@ -141,6 +141,10 @@ int main(int argc, char **argv) { using namespace Grid; Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < Date: Thu, 28 Jul 2022 11:37:02 -0400 Subject: [PATCH 082/240] Updated timing --- Grid/algorithms/iterative/ConjugateGradient.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index b65eea46..bcac6571 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -117,6 +117,7 @@ public: GridStopWatch MatrixTimer; GridStopWatch SolverTimer; + RealD usecs = -usecond(); SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { @@ -166,14 +167,16 @@ public: // Stopping condition if (cp <= rsq) { + 
usecs +=usecond(); SolverTimer.Stop(); Linop.HermOpAndNorm(psi, mmp, d, qq); p = mmp - src; - + GridBase *grid = src.Grid(); + RealD DwfFlops = (1452. )*grid->gSites()*4*k + + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra RealD srcnorm = std::sqrt(norm2(src)); RealD resnorm = std::sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << "\tComputed residual " << std::sqrt(cp / ssq) << "\tTrue residual " << true_residual @@ -187,6 +190,8 @@ public: std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() < Date: Thu, 28 Jul 2022 11:37:12 -0400 Subject: [PATCH 083/240] Better timing --- tests/Test_dwf_mixedcg_prec.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index da71f72d..1e6da515 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -95,26 +95,34 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); - for(int i=0;i<100;i++){ + for(int i=0;i<1;i++){ result_o_2 = Zero(); t1=usecond(); CG(HermOpEO,src_o,result_o_2); t2=usecond(); iters = CG.IterationsToComplete; - flops = 1320.0*2*FGrid->gSites()*iters; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 28 Jul 2022 11:37:36 -0400 Subject: [PATCH 084/240] Better logging --- Grid/log/Log.h | 8 +++++--- 1 file changed, 5 
insertions(+), 3 deletions(-) diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 4d512ff6..fd706771 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -138,7 +138,8 @@ public: stream << std::setw(log.topWidth); } stream << log.topName << log.background()<< " : "; - stream << log.colour() << std::left; + // stream << log.colour() << std::left; + stream << std::left; if (log.chanWidth > 0) { stream << std::setw(log.chanWidth); @@ -153,9 +154,9 @@ public: stream << log.evidence() << now << log.background() << " : " ; } - stream << log.colour(); + // stream << log.colour(); + stream << std::right; stream.flags(f); - return stream; } else { return devnull; @@ -180,6 +181,7 @@ extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; extern GridLogger GridLogDebug ; extern GridLogger GridLogPerformance; +extern GridLogger GridLogDslash; extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; extern GridLogger GridLogHMC; From a913b8be12726c5b5c8c831a13c82fedc78c3fc3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 11:37:55 -0400 Subject: [PATCH 085/240] Dslash self timing. 
Might want to not have this --- Grid/log/Log.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index 63bc454f..acccec0e 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -68,6 +68,7 @@ GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL"); GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); +GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE"); @@ -80,6 +81,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); + GridLogDslash.Active(0); GridLogIntegrator.Active(1); GridLogColours.Active(0); GridLogHMC.Active(1); @@ -91,6 +93,7 @@ void GridLogConfigure(std::vector &logstreams) { if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); + if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); From 486409574ea8a6a50bf6f18400c1ae90dd489c2d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 11:38:34 -0400 Subject: [PATCH 086/240] Expanded cach to avoid any allocs in HMC --- Grid/allocator/MemoryManager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index ef02f6aa..d055898f 100644 --- 
a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void) ////////////////////////////////////////////////////////////////////// MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; int MemoryManager::Victim[MemoryManager::NallocType]; -int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 }; +int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; ////////////////////////////////////////////////////////////////////// // Actual allocation and deallocation utils From 60e63dca1dbb6a3fa2064b42972353e6f0a1f575 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 11:39:15 -0400 Subject: [PATCH 087/240] Add memory logging channel --- Grid/allocator/MemoryManagerCache.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 04b3fe95..f6d07582 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -3,8 +3,14 @@ #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); -//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); -#define dprintf(...) + +#define MAXLINE 512 +static char print_buffer [ MAXLINE ]; + +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout); +#define dprintf(...) 
+ //////////////////////////////////////////////////////////// @@ -104,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -126,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -150,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -165,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); 
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; From 8137cc704914f48321d5f9497492cdbe1dfd2f1f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 12:01:51 -0400 Subject: [PATCH 088/240] Allways concurrent comms --- Grid/communicator/Communicator_mpi3.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b5f751b8..5c009890 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -396,16 +396,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list,dir); - list.resize(0); - } - + /* if ( CommunicatorPolicy == CommunicatorPolicySequential ) { + * this->StencilSendToRecvFromComplete(list,dir); + * list.resize(0); + * } + */ return off_node_bytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { - // std::cout << "Copy Synchronised\n"< Date: Thu, 28 Jul 2022 12:13:35 -0400 Subject: [PATCH 089/240] High res timer instead of gettimeofday --- Grid/perfmon/PerfCount.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/perfmon/PerfCount.cc b/Grid/perfmon/PerfCount.cc index 2062bb59..114c36a0 100644 --- a/Grid/perfmon/PerfCount.cc +++ b/Grid/perfmon/PerfCount.cc @@ -27,10 +27,13 @@ Author: paboyle /* END LEGAL */ #include -#include +#include +#include NAMESPACE_BEGIN(Grid); +GridTimePoint theProgramStart = GridClock::now(); + #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define RawConfig(A,B) (A<<8|B) const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { From 9c21add0c6a54ff68efc63dfd354a225464e8035 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 
12:14:03 -0400 Subject: [PATCH 090/240] High res timer replaces getttimeofday --- Grid/perfmon/Timer.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h index 02c23a62..ba5df85a 100644 --- a/Grid/perfmon/Timer.h +++ b/Grid/perfmon/Timer.h @@ -35,17 +35,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid) -// Dress the output; use std::chrono -// C++11 time facilities better? -inline double usecond(void) { - struct timeval tv; - tv.tv_sec = 0; - tv.tv_usec = 0; - gettimeofday(&tv,NULL); - return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; -} - -typedef std::chrono::system_clock GridClock; +//typedef std::chrono::system_clock GridClock; +typedef std::chrono::high_resolution_clock GridClock; typedef std::chrono::time_point GridTimePoint; typedef std::chrono::seconds GridSecs; @@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs; typedef std::chrono::microseconds GridUsecs; typedef std::chrono::microseconds GridTime; +extern GridTimePoint theProgramStart; +// Dress the output; use std::chrono +// C++11 time facilities better? 
+inline double usecond(void) { + auto usecs = std::chrono::duration_cast(GridClock::now()-theProgramStart); + return 1.0*usecs.count(); +} + + inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) { stream << time.count()<<" s"; From a93d5459d4c057b6f8a91137d4d6b59b5477dd10 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 28 Jul 2022 12:18:35 -0400 Subject: [PATCH 091/240] Better mpi request completion --- Grid/stencil/Stencil.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 9d005289..76d7eb77 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -290,6 +290,8 @@ public: std::vector DecompressionsSHM; std::vector CopyReceiveBuffers ; std::vector CachedTransfers; + std::vector MpiReqs; + /////////////////////////////////////////////////////////// // Unified Comms buffers for all directions /////////////////////////////////////////////////////////// @@ -357,9 +359,8 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { - reqs.resize(Packets.size()); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], + _grid->StencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, Packets[i].to_rank,Packets[i].do_send, Packets[i].recv_buf, @@ -370,9 +371,7 @@ public: void CommunicateComplete(std::vector > &reqs) { - for(int i=0;iStencilSendToRecvFromComplete(reqs[i],i); - } + _grid->StencilSendToRecvFromComplete(MpiReqs,i); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. 
@@ -499,6 +498,7 @@ public: Packets.resize(0); CopyReceiveBuffers.resize(0); CachedTransfers.resize(0); + MpiReqs.resize(0); } void AddCopy(void *from,void * to, Integer bytes) { @@ -795,7 +795,6 @@ public: u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } - PrecomputeByteOffsets(); } @@ -1107,7 +1106,6 @@ public: // Gather locally //////////////////////////////////////////////////////// assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so); } @@ -1214,8 +1212,9 @@ public: face_table[face_idx].size()*sizeof(face_table_host[0])); } - if ( comms_send || comms_recv ) + if ( comms_send || comms_recv ) { Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + } face_idx++; //spointers[0] -- low From 74f10c2dc0681e00f484ef193f5bcc79c8c7bfc3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 13:34:11 -0400 Subject: [PATCH 092/240] Move barrier into Stencil Send --- Grid/qcd/action/fermion/WilsonCompressor.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index eba04abf..324283df 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -400,7 +400,6 @@ public: } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); - accelerator_barrier(); } }; From 75bb6b2b4023b56a8ed08c03d2c31fc2944d5883 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 13:35:26 -0400 Subject: [PATCH 093/240] Move barrier into the StencilSend begin routine --- Grid/stencil/Stencil.h | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 76d7eb77..5b89539e 100644 --- 
a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -359,6 +359,7 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + accelerator_barrier(); for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -371,39 +372,19 @@ public: void CommunicateComplete(std::vector > &reqs) { - _grid->StencilSendToRecvFromComplete(MpiReqs,i); + _grid->StencilSendToRecvFromComplete(MpiReqs,0); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. //////////////////////////////////////////////////////////////////////// void Communicate(void) { - if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ - ///////////////////////////////////////////////////////// - // several way threaded on different communicators. - // Cannot combine with Dirichlet operators - // This scheme is needed on Intel Omnipath for best performance - // Deprecate once there are very few omnipath clusters - ///////////////////////////////////////////////////////// - int nthreads = CartesianCommunicator::nCommThreads; - int old = GridThread::GetThreads(); - GridThread::SetThreads(nthreads); - thread_for(i,Packets.size(),{ - _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].bytes,i); - }); - GridThread::SetThreads(old); - } else { - ///////////////////////////////////////////////////////// - // Concurrent and non-threaded asynch calls to MPI - ///////////////////////////////////////////////////////// - std::vector > reqs; - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); - } + ///////////////////////////////////////////////////////// + // Concurrent and non-threaded asynch calls to MPI + ///////////////////////////////////////////////////////// + 
std::vector > reqs; + this->CommunicateBegin(reqs); + this->CommunicateComplete(reqs); } template void HaloExchange(const Lattice &source,compressor &compress) @@ -483,7 +464,6 @@ public: face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); - accelerator_barrier(); } ///////////////////////// From 06d9ce1a021ba4d43dcd5693fce769f97be8db13 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 13:35:56 -0400 Subject: [PATCH 094/240] Synch ranks on node here for GPU - GPU memcopy --- Grid/communicator/Communicator_mpi3.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 5c009890..fef4ea1f 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -406,6 +406,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { acceleratorCopySynchronise(); + StencilBarrier();// Synch shared memory on a single nodes int nreq=list.size(); From 7ba478871576481f79a41fda3fd0d95c6c62ce4a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 13:36:44 -0400 Subject: [PATCH 095/240] Fix --- HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc index 869f41f8..732c4666 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc @@ -332,9 +332,9 @@ int main(int argc, char **argv) { OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); - Level1.push_back(&StrangePseudoFermionBdy); + 
Level1.push_back(&StrangePseudoFermionBdy); // ok Level2.push_back(&StrangePseudoFermionLocal); - Level1.push_back(&StrangePseudoFermionPVBdy); + Level1.push_back(&StrangePseudoFermionPVBdy); //ok //////////////////////////////////// // up down action @@ -436,6 +436,10 @@ int main(int argc, char **argv) { *Numerators[h],*Denominators[h], *NumeratorsF[h],*DenominatorsF[h], OFRp, 500) ); + Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( + *Numerators[h],*Denominators[h], + *NumeratorsF[h],*DenominatorsF[h], + OFRp, 500) ); #else Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); From de40395773a9b5c81be588881ad8c4cbaee98295 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 13:37:16 -0400 Subject: [PATCH 096/240] More timing. Think I should start to use nvtx and rocmtx ?? --- .../WilsonFermion5DImplementation.h | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 51c7df57..44c69583 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -233,10 +233,10 @@ void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) GaugeField HUmu(_Umu.Grid()); HUmu = _Umu*(-0.5); if ( Dirichlet ) { - std::cout << GridLogMessage << " Dirichlet BCs 5d " < Filter(GaugeBlock); Filter.applyFilter(HUmu); } @@ -382,12 +382,14 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - DhopTotalTime-=usecond(); + // std::cout << GridLogDslash<<"Dhop internal"<::DhopInternalOverlappedComms(StencilImpl & 
st, Lebesg ///////////////////////////// // Start comms // Gather intranode and extra node differentiated?? ///////////////////////////// - DhopFaceTime-=usecond(); + DhopFaceTime=-usecond(); st.HaloExchangeOptGather(in,compressor); DhopFaceTime+=usecond(); + // std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " < > requests; st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// - DhopFaceTime-=usecond(); + DhopFaceTime=-usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); + // std::cout << GridLogDslash<< " Dhop Commsmerge end "<::DhopInternalSerialComms(StencilImpl & st, LebesgueOr Compressor compressor(dag); int LLs = in.Grid()->_rdimensions[0]; - - DhopCommTime-=usecond(); + + // std::cout << GridLogDslash<< " Dhop Halo exchange begine " <::DhopInternalSerialComms(StencilImpl & st, LebesgueOr Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } DhopComputeTime+=usecond(); + // std::cout << GridLogDslash<< " Dhop Compute end "< Date: Thu, 4 Aug 2022 15:23:13 -0400 Subject: [PATCH 097/240] Simplify dead code --- .../WilsonKernelsImplementation.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 623da5cf..211edb6c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -416,19 +416,6 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } -#define KERNEL_CALL_TMP(A) \ - const uint64_t NN = Nsite*Ls; \ - auto U_p = & U_v[0]; \ - auto in_p = & in_v[0]; \ - auto out_p = & out_v[0]; \ - auto st_p = st_v._entries_p; \ - auto st_perm = st_v._permute_type; \ - accelerator_forNB( ss, 
NN, Simd::Nsimd(), { \ - int sF = ss; \ - int sU = ss/Ls; \ - WilsonKernels::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \ - }); \ - accelerator_barrier(); #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -448,8 +435,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ptr[ss]; \ int sU = ss/Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ - }); \ - accelerator_barrier(); + }); #define ASM_CALL(A) \ thread_for( ss, Nsite, { \ @@ -471,7 +457,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifdef SYCL_HACK - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; } + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } #else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #endif From 2cb5bedc15ce11ec0db889fb871b4ced11040734 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 15:24:03 -0400 Subject: [PATCH 098/240] Copy stream HIP improvements --- Grid/threads/Accelerator.cc | 18 +++++++++++------- Grid/threads/Accelerator.h | 7 ++++--- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 092d46b3..7dfbc4ff 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -1,6 +1,7 @@ #include NAMESPACE_BEGIN(Grid); +int world_rank; // Use to control world rank for print guarding int acceleratorAbortOnGpuError=1; uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; @@ -16,7 +17,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; #ifdef GRID_CUDA cudaDeviceProp *gpu_props; cudaStream_t copyStream; -cudaStream_t cpuStream; +cudaStream_t computeStream; void acceleratorInit(void) { int nDevices = 1; @@ 
-24,7 +25,8 @@ void acceleratorInit(void) gpu_props = new cudaDeviceProp[nDevices]; char * localRankStr = NULL; - int rank = 0, world_rank=0; + int rank = 0; + world_rank=0; if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} @@ -99,7 +101,7 @@ void acceleratorInit(void) cudaSetDevice(device); cudaStreamCreate(©Stream); - cudaStreamCreate(&cpuStream); + cudaStreamCreate(&computeStream); const int len=64; char busid[len]; if( rank == world_rank ) { @@ -114,7 +116,7 @@ void acceleratorInit(void) #ifdef GRID_HIP hipDeviceProp_t *gpu_props; hipStream_t copyStream; -hipStream_t cpuStream; +hipStream_t computeStream; void acceleratorInit(void) { int nDevices = 1; @@ -122,7 +124,8 @@ void acceleratorInit(void) gpu_props = new hipDeviceProp_t[nDevices]; char * localRankStr = NULL; - int rank = 0, world_rank=0; + int rank = 0; + world_rank=0; // We extract the local rank initialization using an environment variable if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { @@ -183,7 +186,7 @@ void acceleratorInit(void) #endif hipSetDevice(device); hipStreamCreate(©Stream); - hipStreamCreate(&cpuStream); + hipStreamCreate(&computeStream); const int len=64; char busid[len]; if( rank == world_rank ) { @@ -210,7 +213,8 @@ void acceleratorInit(void) #endif char * localRankStr = NULL; - int rank = 0, world_rank=0; + int rank = 0; + world_rank=0; // We extract the local rank initialization using an environment variable if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 3be3bbe7..bd09a880 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -370,7 +370,8 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { num1,num2,nsimd, lambda); \ } \ } - +// Works with 
MPI if barrier here +// accelerator_barrier(); template __global__ __launch_bounds__(64,1) @@ -400,7 +401,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipStreamSynchronize(cpuStream); \ + hipDeviceSynchronize(); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ @@ -443,7 +444,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch { - hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice); + hipMemcpyDtoDAsync(to,from,bytes, copyStream); } inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; From 95b640cb6ba6b3211554f799d2ede5744ae37403 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Aug 2022 15:43:52 -0400 Subject: [PATCH 099/240] 10TF/s on 32^3 x 64 on single node --- Grid/threads/Accelerator.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index bd09a880..4e476abb 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -107,7 +107,7 @@ void acceleratorInit(void); extern int acceleratorAbortOnGpuError; extern cudaStream_t copyStream; -extern cudaStream_t cpuStream; +extern cudaStream_t computeStream; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -135,7 +135,7 @@ inline void cuda_mem(void) }; \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ } #define accelerator_for6dNB(iter1, num1, \ @@ -154,7 +154,7 @@ inline void cuda_mem(void) }; \ dim3 cu_blocks (num1,num2,num3); \ dim3 cu_threads(num4,num5,num6); \ - Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); 
\ + Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ } template __global__ @@ -190,7 +190,7 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, #define accelerator_barrier(dummy) \ { \ - cudaDeviceSynchronize(); \ + cudaStreamSynchronize(computeStream); \ cudaError err = cudaGetLastError(); \ if ( cudaSuccess != err ) { \ printf("accelerator_barrier(): Cuda error %s \n", \ @@ -340,7 +340,7 @@ NAMESPACE_BEGIN(Grid); #define accelerator_inline __host__ __device__ inline extern hipStream_t copyStream; -extern hipStream_t cpuStream; +extern hipStream_t computeStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -362,16 +362,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ - 0,0/*cpuStream*/, \ + 0,computeStream, \ num1,num2,nsimd, lambda); \ } else { \ hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ - 0,0/*cpuStream*/, \ + 0,computeStream, \ num1,num2,nsimd, lambda); \ } \ } -// Works with MPI if barrier here -// accelerator_barrier(); template __global__ __launch_bounds__(64,1) @@ -401,7 +399,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipDeviceSynchronize(); \ + hipStreamSynchronize(computeStream); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ From 4dc3d6fce05559c7db5f5d1f0c45aafc4eed6155 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:53:19 -0400 Subject: [PATCH 100/240] Buy into Nvidia/Rocm etc... tracing. 
--- Grid/GridCore.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 2209f960..8e04a859 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -44,7 +44,8 @@ Author: paboyle #include #include #include -#include +#include +//#include #include #include #include From 659fac9dfb720528ab7e97770cb8a5053187daa5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:54:25 -0400 Subject: [PATCH 101/240] Tracing hook --- Grid/algorithms/iterative/ConjugateGradient.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index bcac6571..e0fa7f83 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -58,6 +58,7 @@ public: void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { + GRID_TRACE("ConjugateGradient"); psi.Checkerboard() = src.Checkerboard(); conformable(psi, src); From ca40a1b00b2ff29e6084fc86717a51c6b42b878d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:54:55 -0400 Subject: [PATCH 102/240] Tracing --- Grid/algorithms/iterative/ConjugateGradientMultiShift.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h index 0e70916f..9d614404 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h @@ -84,6 +84,7 @@ public: void operator() (LinearOperatorBase &Linop, const Field &src, std::vector &psi) { + GRID_TRACE("ConjugateGradientMultiShift"); GridBase *grid = src.Grid(); From 5752538661aa7aa29be9277e4229a5a581778a7b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:57:32 -0400 Subject: [PATCH 103/240] Tracing --- Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h | 1 + 1 file changed, 1 
insertion(+) diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index 1470a229..b1d90688 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -127,6 +127,7 @@ public: void operator() (LinearOperatorBase &Linop_d, const FieldD &src_d, std::vector &psi_d) { + GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GridBase *DoublePrecGrid = src_d.Grid(); //////////////////////////////////////////////////////////////////////// From 19cc7653fb2b7def35774a11f56c38e0ea5b35a0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:57:51 -0400 Subject: [PATCH 104/240] Tracing --- Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h index ab12a41f..094f847d 100644 --- a/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h +++ b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h @@ -73,6 +73,7 @@ public: } void operator()(const FieldD &src, FieldD &psi) { + GRID_TRACE("ConjugateGradientReliableUpdate"); LinearOperatorBase *Linop_f_use = &Linop_f; bool using_fallback = false; From 9295ed8d208ca4922ab4a3128e6eff8c62db74df Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 16:59:51 -0400 Subject: [PATCH 105/240] Print full memory range --- Grid/communicator/SharedMemoryMPI.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index fe2f2d89..4993a02e 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -523,7 +523,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) } if ( WorldRank == 0 ){ std::cout << WorldRank << 
header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes - << "bytes at "<< std::hex<< ShmCommBuf < Date: Wed, 31 Aug 2022 17:00:25 -0400 Subject: [PATCH 106/240] Tracing --- Grid/lattice/Lattice_arith.h | 16 ++++++++++++++++ Grid/lattice/Lattice_base.h | 3 +++ Grid/lattice/Lattice_reduction.h | 8 ++++++++ 3 files changed, 27 insertions(+) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index b39a475d..aebc093a 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////////////////////////// template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + GRID_TRACE("mult"); ret.Checkerboard() = lhs.Checkerboard(); autoView( ret_v , ret, AcceleratorWrite); autoView( lhs_v , lhs, AcceleratorRead); @@ -53,6 +54,7 @@ void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ template inline void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + GRID_TRACE("mac"); ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); @@ -70,6 +72,7 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ template inline void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + GRID_TRACE("sub"); ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); @@ -86,6 +89,7 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ } template inline void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ + GRID_TRACE("add"); ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); @@ -106,6 +110,7 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ////////////////////////////////////////////////////////////////////////////////////////////////////// template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + GRID_TRACE("mult"); ret.Checkerboard() = 
lhs.Checkerboard(); conformable(lhs,ret); autoView( ret_v , ret, AcceleratorWrite); @@ -119,6 +124,7 @@ void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + GRID_TRACE("mac"); ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); autoView( ret_v , ret, AcceleratorWrite); @@ -133,6 +139,7 @@ void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + GRID_TRACE("sub"); ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); autoView( ret_v , ret, AcceleratorWrite); @@ -146,6 +153,7 @@ void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ } template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ + GRID_TRACE("add"); ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); autoView( ret_v , ret, AcceleratorWrite); @@ -163,6 +171,7 @@ void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ////////////////////////////////////////////////////////////////////////////////////////////////////// template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + GRID_TRACE("mult"); ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); autoView( ret_v , ret, AcceleratorWrite); @@ -177,6 +186,7 @@ void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + GRID_TRACE("mac"); ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); autoView( ret_v , ret, AcceleratorWrite); @@ -191,6 +201,7 @@ void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ + GRID_TRACE("sub"); ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); autoView( ret_v , ret, AcceleratorWrite); @@ -204,6 +215,7 @@ void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ } template inline void add(Lattice &ret,const 
obj2 &lhs,const Lattice &rhs){ + GRID_TRACE("add"); ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); autoView( ret_v , ret, AcceleratorWrite); @@ -218,6 +230,7 @@ void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ template inline void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + GRID_TRACE("axpy"); ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); @@ -231,6 +244,7 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & } template inline void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + GRID_TRACE("axpby"); ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); @@ -246,11 +260,13 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice template inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y) { + GRID_TRACE("axpy_norm"); return axpy_norm_fast(ret,a,x,y); } template inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y) { + GRID_TRACE("axpby_norm"); return axpby_norm_fast(ret,a,b,x,y); } diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9c3d723f..49c0a100 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -117,6 +117,7 @@ public: //////////////////////////////////////////////////////////////////////////////// template inline Lattice & operator=(const LatticeUnaryExpression &expr) { + GRID_TRACE("ExpressionTemplateEval"); GridBase *egrid(nullptr); GridFromExpression(egrid,expr); assert(egrid!=nullptr); @@ -140,6 +141,7 @@ public: } template inline Lattice & operator=(const LatticeBinaryExpression &expr) { + GRID_TRACE("ExpressionTemplateEval"); GridBase *egrid(nullptr); GridFromExpression(egrid,expr); assert(egrid!=nullptr); @@ -163,6 +165,7 @@ public: } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) { + GRID_TRACE("ExpressionTemplateEval"); GridBase *egrid(nullptr); 
GridFromExpression(egrid,expr); assert(egrid!=nullptr); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 16feb856..bcd09c04 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -488,6 +488,14 @@ template inline void sliceSum(const Lattice &Data,std::vector< int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); } +template inline +std::vector +sliceSum(const Lattice &Data,int orthogdim) +{ + std::vector result; + sliceSum(Data,result,orthogdim); + return result; +} template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) From a34a6e059f537358fd3274aeecb5831dcc174e73 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:06:08 -0400 Subject: [PATCH 107/240] Logging improvement. Sinitial will be used to improve RHMC terms --- Grid/qcd/action/ActionBase.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index b8c81f99..353c4d3b 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -52,10 +52,13 @@ public: deriv_us = S_us = refresh_us = 0.0; deriv_num=0; deriv_norm_sum = deriv_max_sum=0.0; + Fdt_max_sum = Fdt_norm_sum = 0.0; } void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) { - deriv_max_sum+=max; deriv_norm_sum+=nrm; - Fdt_max_sum+=Fdt_max; Fdt_norm_sum+=Fdt_nrm; deriv_num++; + deriv_max_sum+=max; + deriv_norm_sum+=nrm; + Fdt_max_sum+=Fdt_max; + Fdt_norm_sum+=Fdt_nrm; deriv_num++; } RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; @@ -73,6 +76,7 @@ public: // Heatbath? 
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual RealD S(const GaugeField& U) = 0; // evaluate the action + virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual std::string action_name() = 0; // return the action name virtual std::string LogParameters() = 0; // prints action parameters From b3f33f82f764b09d39f569acb3be4902533ad8ef Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:06:47 -0400 Subject: [PATCH 108/240] Decrease self timing hooks, use nvtx / roctx type tracing hooks instead --- Grid/qcd/action/fermion/CayleyFermion5D.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 1ce4012e..cf39ec99 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -183,16 +183,6 @@ public: GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); - void CayleyReport(void); - void CayleyZeroCounters(void); - - double M5Dflops; - double M5Dcalls; - double M5Dtime; - - double MooeeInvFlops; - double MooeeInvCalls; - double MooeeInvTime; protected: virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); From f991ad7d5c360f04a377a5718ddca8833652085a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:08:18 -0400 Subject: [PATCH 109/240] Remove timing hooks as tracing replaces --- Grid/qcd/action/fermion/ImprovedStaggeredFermion.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index ecf44ed7..60cfc727 100644 --- 
a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -47,18 +47,6 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } - //////////////////////////////////////// - // Performance monitoring - //////////////////////////////////////// - void Report(void); - void ZeroCounters(void); - double DhopTotalTime; - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; - /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// From 730be89abff0cecbc3a5daf72a4d6ddbd70b694c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:08:44 -0400 Subject: [PATCH 110/240] Remove timing hooks as tracing replaces --- Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index ca660610..5b26b35c 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -52,18 +52,6 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } - //////////////////////////////////////// - // Performance monitoring - //////////////////////////////////////// - void Report(void); - void ZeroCounters(void); - double DhopTotalTime; - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; - /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// From e8a0a1e75dbebf3cd38b5d482ae1681d363012fa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:09:47 -0400 Subject: [PATCH 111/240] Tracing replaces self timing hooks --- 
Grid/qcd/action/fermion/NaiveStaggeredFermion.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index ca38a64f..5f69c2b1 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -47,18 +47,6 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } - //////////////////////////////////////// - // Performance monitoring - //////////////////////////////////////// - void Report(void); - void ZeroCounters(void); - double DhopTotalTime; - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; - /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// From 7c686d29c98fbe0e9fbb915696af3ca579593718 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:10:17 -0400 Subject: [PATCH 112/240] Tracing replaces self timing --- Grid/qcd/action/fermion/WilsonCompressor.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 324283df..de2f1979 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -294,11 +294,7 @@ public: typedef typename Base::View_type View_type; typedef typename Base::StencilVector StencilVector; - void ZeroCountersi(void) { } - void Reporti(int calls) { } - // Vector surface_list; - WilsonStencil(GridBase *grid, int npoints, int checkerboard, @@ -306,7 +302,6 @@ public: const std::vector &distances,Parameters p) : CartesianStencil (grid,npoints,checkerboard,directions,distances,p) { - ZeroCountersi(); // surface_list.resize(0); this->same_node.resize(npoints); }; From ee2d7369b3d0a1602ee6d85ac638299af6036510 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: 
Wed, 31 Aug 2022 17:10:45 -0400 Subject: [PATCH 113/240] Tracing replaces self timing --- Grid/qcd/action/fermion/WilsonFermion.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index bf8926d0..a7a1bb69 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -74,20 +74,6 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } - void Report(void); - void ZeroCounters(void); - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; - double DhopTotalTime; - - double DerivCalls; - double DerivCommTime; - double DerivComputeTime; - double DerivDhopComputeTime; - ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent From 24182ca8bf16b21373ddcd70621fbf3fdf9de8f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:11:18 -0400 Subject: [PATCH 114/240] HIP allows conserved currents. 
Tracing replaces self timeing --- .../CayleyFermion5DImplementation.h | 57 +------------------ 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 51a7990c..208e5f81 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -152,58 +152,6 @@ void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi } } -template void CayleyFermion5D::CayleyReport(void) -{ - this->Report(); - Coordinate latt = GridDefaultLatt(); - RealD volume = this->Ls; for(int mu=0;mu_FourDimGrid->_Nprocessors; - if ( M5Dcalls > 0 ) { - std::cout << GridLogMessage << "#### M5D calls report " << std::endl; - std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl; - std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl; - - // Flops = 10.0*(Nc*Ns) *Ls*vol - RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - - // Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting) - // read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 ) - // write = 1 - RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. 
* 1.e-9; - std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl; - } - - if ( MooeeInvCalls > 0 ) { - - std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; - std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; - std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; -#ifdef GRID_CUDA - RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; -#else - // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex - RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; -#endif - } - -} -template void CayleyFermion5D::CayleyZeroCounters(void) -{ - this->ZeroCounters(); - M5Dflops=0; - M5Dcalls=0; - M5Dtime=0; - MooeeInvFlops=0; - MooeeInvCalls=0; - MooeeInvTime=0; -} - template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { @@ -646,7 +594,6 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, assert(mass_plus == mass_minus); RealD mass = mass_plus; -#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -765,7 +712,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, else q_out += C; } -#endif + } template @@ -832,7 +779,6 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 
1 : 0; unsigned int LLt = GridDefaultLatt()[Tp]; //////////////////////////////////////////////// @@ -952,7 +898,6 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, InsertSlice(L_Q, q_out, s , 0); } -#endif } #undef Pp #undef Pm From 111b30ca1d3045a27b094ea07c150bd5f9e96ea8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:11:48 -0400 Subject: [PATCH 115/240] Tracing replaces self timing --- .../implementation/CayleyFermion5Dcache.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 1581bee4..0d2516c4 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -63,9 +63,6 @@ CayleyFermion5D::M5D(const FermionField &psi_i, // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) - M5Dcalls++; - M5Dtime-=usecond(); - uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t s = sss%Ls; @@ -78,7 +75,6 @@ CayleyFermion5D::M5D(const FermionField &psi_i, spProj5p(tmp2,psi(idx_l)); coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); }); - M5Dtime+=usecond(); } template @@ -104,9 +100,6 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, int Ls=this->Ls; // Flops = 6.0*(Nc*Ns) *Ls*vol - M5Dcalls++; - M5Dtime-=usecond(); - uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t s = sss%Ls; @@ -119,7 +112,6 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, spProj5m(tmp2,psi(idx_l)); coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); }); - M5Dtime+=usecond(); } template @@ -140,8 +132,6 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi auto pleem = & leem[0]; auto pueem = & ueem[0]; - MooeeInvCalls++; - MooeeInvTime-=usecond(); uint64_t 
nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -178,8 +168,6 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi coalescedWrite(chi[ss+s],res); } }); - - MooeeInvTime+=usecond(); } @@ -202,10 +190,6 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi assert(psi.Checkerboard() == psi.Checkerboard()); - MooeeInvCalls++; - MooeeInvTime-=usecond(); - - uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -242,7 +226,6 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi coalescedWrite(chi[ss+s],res); } }); - MooeeInvTime+=usecond(); } From 9bff188f028b04d1f6a739afe861fc6505c67b1c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:12:05 -0400 Subject: [PATCH 116/240] Tracing replaces self timing --- .../fermion/implementation/CayleyFermion5Dvec.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h index b54f63ad..e3bf67db 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h @@ -94,10 +94,6 @@ CayleyFermion5D::M5D(const FermionField &psi_i, d_p[ss] = diag[s]; }} - - M5Dcalls++; - M5Dtime-=usecond(); - assert(Nc==3); thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs @@ -198,7 +194,6 @@ CayleyFermion5D::M5D(const FermionField &psi_i, } #endif }); - M5Dtime+=usecond(); } template @@ -242,8 +237,6 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, d_p[ss] = diag[s]; }} - M5Dcalls++; - M5Dtime-=usecond(); thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; @@ -339,7 +332,6 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, } #endif }); - M5Dtime+=usecond(); } @@ -813,9 +805,6 @@ 
CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi, } assert(_Matp->size()==Ls*LLs); - MooeeInvCalls++; - MooeeInvTime-=usecond(); - if ( switcheroo::iscomplex() ) { thread_loop( (auto site=0;site::MooeeInternal(const FermionField &psi, FermionField &chi, MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm); }); } - MooeeInvTime+=usecond(); + } NAMESPACE_END(Grid); From e6dcb821adacbb2eebdfc95732066eab48db516a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:12:31 -0400 Subject: [PATCH 117/240] Tracing replaces self timing --- .../implementation/DomainWallEOFAFermionCache.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 9a8454ef..6b8336cc 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -54,8 +54,6 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi auto pupper = &upper[0]; auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); auto nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -71,7 +69,6 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi } }); - this->M5Dtime += usecond(); } template @@ -91,8 +88,6 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); auto nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -108,7 +103,6 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio } }); - this->M5Dtime += usecond(); } template @@ -127,8 +121,6 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie auto pleem = & this->leem[0]; auto pueem = & 
this->ueem[0]; - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -164,7 +156,6 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie coalescedWrite(chi[ss+s],res); } }); - this->MooeeInvTime += usecond(); } template @@ -185,8 +176,6 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion assert(psi.Checkerboard() == psi.Checkerboard()); - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); auto nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -223,7 +212,6 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion } }); - this->MooeeInvTime += usecond(); } NAMESPACE_END(Grid); From 8a9e6471208a152e074ff0b4c0bb3d4090e5286f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:13:44 -0400 Subject: [PATCH 118/240] Tracing replaces self timing --- Grid/qcd/action/fermion/WilsonFermion5D.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index eced6b81..0b07d320 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -78,21 +78,6 @@ public: int Dirichlet; Coordinate Block; - /********** Deprecate timers **********/ - void Report(void); - void ZeroCounters(void); - double DhopCalls; - double DhopCommTime; - double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; - double DhopTotalTime; - - double DerivCalls; - double DerivCommTime; - double DerivComputeTime; - double DerivDhopComputeTime; - /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// From db0fe6ddbb13aa16016753d4ec480c38f1059c51 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:14:14 
-0400 Subject: [PATCH 119/240] Tracing replaces self timing --- ...ImprovedStaggeredFermion5DImplementation.h | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 888691c4..d235abbb 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & int LLs = in.Grid()->_rdimensions[0]; int len = U.Grid()->oSites(); - DhopFaceTime-=usecond(); st.Prepare(); st.HaloGather(in,compressor); - DhopFaceTime+=usecond(); - DhopCommTime -=usecond(); std::vector > requests; st.CommunicateBegin(requests); // st.HaloExchangeOptGather(in,compressor); // Wilson compressor - DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms - DhopFaceTime+=usecond(); ////////////////////////////////////////////////////////////////////////////////////////////////////// // Remove explicit thread mapping introduced for OPA reasons. 
////////////////////////////////////////////////////////////////////////////////////////////////////// - DhopComputeTime-=usecond(); { int interior=1; int exterior=0; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime+=usecond(); - DhopFaceTime-=usecond(); st.CommsMerge(compressor); - DhopFaceTime+=usecond(); st.CommunicateComplete(requests); - DhopCommTime +=usecond(); - DhopComputeTime2-=usecond(); { int interior=0; int exterior=1; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime2+=usecond(); } template @@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; - //double t1=usecond(); - DhopTotalTime -= usecond(); - DhopCommTime -= usecond(); st.HaloExchange(in,compressor); - DhopCommTime += usecond(); - DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion { int interior=1; int exterior=1; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime += usecond(); - DhopTotalTime += usecond(); - } /*CHANGE END*/ @@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, template void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { - DhopCalls+=1; conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid conformable(in.Grid(),out.Grid()); // drops the cb check @@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { - DhopCalls+=1; conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid conformable(in.Grid(),out.Grid()); // drops the cb check @@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie template void ImprovedStaggeredFermion5D::Dhop(const 
FermionField &in, FermionField &out,int dag) { - DhopCalls+=2; conformable(in.Grid(),FermionGrid()); // verifies full grid conformable(in.Grid(),out.Grid()); @@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); } -template -void ImprovedStaggeredFermion5D::Report(void) -{ - Coordinate latt = GridDefaultLatt(); - RealD volume = Ls; for(int mu=0;mu_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); - - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - - std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : " - << DhopCalls << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : " - << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; - - // Average the compute time - _FourDimGrid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - - RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - 
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" < -void ImprovedStaggeredFermion5D::ZeroCounters(void) -{ - DhopCalls = 0; - DhopTotalTime = 0; - DhopCommTime = 0; - DhopComputeTime = 0; - DhopFaceTime = 0; - - - Stencil.ZeroCounters(); - StencilEven.ZeroCounters(); - StencilOdd.ZeroCounters(); -} - ///////////////////////////////////////////////////////////////////////// // Implement the general interface. Here we use SAME mass on all slices ///////////////////////////////////////////////////////////////////////// From efee33c55df37c425fbf175b0ebdcac610eb2331 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:14:57 -0400 Subject: [PATCH 120/240] Tracing replaces self timing --- .../ImprovedStaggeredFermionImplementation.h | 79 ------------------- 1 file changed, 79 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 05d9a17e..4c80a1d5 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -334,7 +334,6 @@ void ImprovedStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionF template void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -346,7 +345,6 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & template void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -359,7 +357,6 @@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField template void ImprovedStaggeredFermion::DhopEO(const FermionField &in, 
FermionField &out, int dag) { - DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -418,47 +415,33 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st Compressor compressor; int len = U.Grid()->oSites(); - DhopTotalTime -= usecond(); - - DhopFaceTime -= usecond(); st.Prepare(); st.HaloGather(in,compressor); - DhopFaceTime += usecond(); - DhopCommTime -=usecond(); std::vector > requests; st.CommunicateBegin(requests); - DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); - DhopFaceTime+= usecond(); ////////////////////////////////////////////////////////////////////////////////////////////////////// // Removed explicit thread comms ////////////////////////////////////////////////////////////////////////////////////////////////////// - DhopComputeTime -= usecond(); { int interior=1; int exterior=0; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime += usecond(); st.CommunicateComplete(requests); - DhopCommTime +=usecond(); // First to enter, last to leave timing - DhopFaceTime -= usecond(); st.CommsMerge(compressor); - DhopFaceTime -= usecond(); - DhopComputeTime2 -= usecond(); { int interior=0; int exterior=1; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime2 += usecond(); } @@ -471,78 +454,16 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le { assert((dag == DaggerNo) || (dag == DaggerYes)); - DhopTotalTime -= usecond(); - - DhopCommTime -= usecond(); Compressor compressor; st.HaloExchange(in, compressor); - DhopCommTime += usecond(); - DhopComputeTime -= usecond(); { int interior=1; int exterior=1; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopComputeTime += usecond(); - DhopTotalTime += usecond(); }; - //////////////////////////////////////////////////////////////// - // Reporting - 
//////////////////////////////////////////////////////////////// -template -void ImprovedStaggeredFermion::Report(void) -{ - Coordinate latt = _grid->GlobalDimensions(); - RealD volume = 1; for(int mu=0;mu_Nprocessors; - RealD NN = _grid->NodeCount(); - - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - - std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : " - << DhopCalls << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : " - << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; - - // Average the compute time - _grid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - - RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" < -void ImprovedStaggeredFermion::ZeroCounters(void) -{ - DhopCalls = 0; - DhopTotalTime = 0; - DhopCommTime = 0; - DhopComputeTime = 0; - DhopFaceTime = 0; - - Stencil.ZeroCounters(); - 
StencilEven.ZeroCounters(); - StencilOdd.ZeroCounters(); -} - - //////////////////////////////////////////////////////// // Conserved current - not yet implemented. //////////////////////////////////////////////////////// From abfaa00d3e27521ac3314a5ddb61e7f4ca54a48d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:15:24 -0400 Subject: [PATCH 121/240] Tracing replaces self timing --- .../implementation/MobiusEOFAFermionCache.h | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 41b9170d..617a18df 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -55,9 +55,6 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss = sss*Ls; @@ -73,7 +70,6 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField } }); - this->M5Dtime += usecond(); } template @@ -99,9 +95,6 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion auto pshift_coeffs = &shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss = sss*Ls; @@ -122,7 +115,6 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion } }); - this->M5Dtime += usecond(); } template @@ -143,9 +135,6 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(), { uint64_t ss = 
sss*Ls; @@ -161,8 +150,6 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); } }); - - this->M5Dtime += usecond(); } template @@ -186,9 +173,6 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm auto pshift_coeffs = &shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol - this->M5Dcalls++; - this->M5Dtime -= usecond(); - auto pm = this->pm; int nloop = grid->oSites()/Ls; @@ -217,7 +201,6 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm } }); - this->M5Dtime += usecond(); } template @@ -237,9 +220,6 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -277,7 +257,6 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & } }); - this->MooeeInvTime += usecond(); } template @@ -297,8 +276,6 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF auto pueem= & this->ueem[0]; auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0]; auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -343,7 +320,6 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF } }); - this->MooeeInvTime += usecond(); } template @@ -363,9 +339,6 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel auto pleem= & this->leem[0]; auto pueem= & this->ueem[0]; - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -402,7 +375,6 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, 
FermionFiel coalescedWrite(chi[ss+s],res); } }); - this->MooeeInvTime += usecond(); } template @@ -423,9 +395,6 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; - this->MooeeInvCalls++; - this->MooeeInvTime -= usecond(); - int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; @@ -469,7 +438,6 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi } }); - this->MooeeInvTime += usecond(); } NAMESPACE_END(Grid); From 21371a7e5b5bd15a8a17f05f52b0d572312823ca Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:16:05 -0400 Subject: [PATCH 122/240] Tracing replaces self timing --- .../NaiveStaggeredFermionImplementation.h | 79 ------------------- 1 file changed, 79 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h index 788e02cf..bf23d99d 100644 --- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -263,7 +263,6 @@ void NaiveStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionFiel template void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -275,7 +274,6 @@ void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out template void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -288,7 +286,6 @@ void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &o template void 
NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -345,47 +342,33 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L Compressor compressor; int len = U.Grid()->oSites(); - DhopTotalTime -= usecond(); - - DhopFaceTime -= usecond(); st.Prepare(); st.HaloGather(in,compressor); - DhopFaceTime += usecond(); - DhopCommTime -=usecond(); std::vector > requests; st.CommunicateBegin(requests); - DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); - DhopFaceTime+= usecond(); ////////////////////////////////////////////////////////////////////////////////////////////////////// // Removed explicit thread comms ////////////////////////////////////////////////////////////////////////////////////////////////////// - DhopComputeTime -= usecond(); { int interior=1; int exterior=0; Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); } - DhopComputeTime += usecond(); st.CommunicateComplete(requests); - DhopCommTime +=usecond(); // First to enter, last to leave timing - DhopFaceTime -= usecond(); st.CommsMerge(compressor); - DhopFaceTime -= usecond(); - DhopComputeTime2 -= usecond(); { int interior=0; int exterior=1; Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); } - DhopComputeTime2 += usecond(); } template @@ -396,78 +379,16 @@ void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Lebes { assert((dag == DaggerNo) || (dag == DaggerYes)); - DhopTotalTime -= usecond(); - - DhopCommTime -= usecond(); Compressor compressor; st.HaloExchange(in, compressor); - DhopCommTime += usecond(); - DhopComputeTime -= usecond(); { int interior=1; int exterior=1; Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); } - DhopComputeTime += usecond(); - DhopTotalTime += usecond(); }; - //////////////////////////////////////////////////////////////// - // Reporting - 
//////////////////////////////////////////////////////////////// -template -void NaiveStaggeredFermion::Report(void) -{ - Coordinate latt = _grid->GlobalDimensions(); - RealD volume = 1; for(int mu=0;mu_Nprocessors; - RealD NN = _grid->NodeCount(); - - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - - std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : " - << DhopCalls << std::endl; - std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : " - << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : " - << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : " - << DhopComputeTime / DhopCalls << " us" << std::endl; - - // Average the compute time - _grid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - - RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" < -void NaiveStaggeredFermion::ZeroCounters(void) -{ - DhopCalls = 0; - DhopTotalTime = 0; - DhopCommTime = 0; - DhopComputeTime = 0; - DhopFaceTime = 0; - - Stencil.ZeroCounters(); - StencilEven.ZeroCounters(); - 
StencilOdd.ZeroCounters(); -} - - //////////////////////////////////////////////////////// // Conserved current - not yet implemented. //////////////////////////////////////////////////////// From fd33c835dd91f1cab44f3569c462f53c742a82df Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:18:17 -0400 Subject: [PATCH 123/240] Feynman rule fix and tracing replaces self timing --- .../WilsonFermion5DImplementation.h | 256 +++++------------- 1 file changed, 67 insertions(+), 189 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 44c69583..388094b2 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -103,8 +103,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Block = block; } - ZeroCounters(); - if (Impl::LsVectorised) { int nsimd = Simd::Nsimd(); @@ -143,89 +141,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, // <<" " << StencilEven.surface_list.size()< -void WilsonFermion5D::Report(void) -{ - RealD NP = _FourDimGrid->_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); - RealD volume = Ls; - Coordinate latt = _FourDimGrid->GlobalDimensions(); - for(int mu=0;mu 0 ) { - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << 
" us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; - - // Average the compute time - _FourDimGrid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - } - - if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil" < 0){ - std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" < -void WilsonFermion5D::ZeroCounters(void) { - DhopCalls = 0; - DhopCommTime = 0; - DhopComputeTime = 0; - DhopComputeTime2= 0; - DhopFaceTime = 0; - DhopTotalTime = 0; - - DerivCalls = 0; - DerivCommTime = 0; - DerivComputeTime = 0; - DerivDhopComputeTime = 0; - - Stencil.ZeroCounters(); - StencilEven.ZeroCounters(); - StencilOdd.ZeroCounters(); - Stencil.ZeroCountersi(); - StencilEven.ZeroCountersi(); - StencilOdd.ZeroCountersi(); -} - template void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) @@ -281,7 +196,6 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, const FermionField &B, int dag) { - 
DerivCalls++; assert((dag==DaggerNo) ||(dag==DaggerYes)); conformable(st.Grid(),A.Grid()); @@ -292,15 +206,12 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, FermionField Btilde(B.Grid()); FermionField Atilde(B.Grid()); - DerivCommTime-=usecond(); st.HaloExchange(B,compressor); - DerivCommTime+=usecond(); Atilde=A; int LLs = B.Grid()->_rdimensions[0]; - DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma if dag @@ -312,8 +223,6 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, // Call the single hop //////////////////////// - DerivDhopComputeTime -= usecond(); - int Usites = U.Grid()->oSites(); Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma); @@ -321,10 +230,8 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, //////////////////////////// // spin trace outer product //////////////////////////// - DerivDhopComputeTime += usecond(); Impl::InsertForce5D(mat, Btilde, Atilde, mu); } - DerivComputeTime += usecond(); } template @@ -382,14 +289,10 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { - // std::cout << GridLogDslash<<"Dhop internal"<::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { + GRID_TRACE("DhopInternalOverlappedComms"); Compressor compressor(dag); int LLs = in.Grid()->_rdimensions[0]; @@ -406,59 +310,58 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg ///////////////////////////// // Start comms // Gather intranode and extra node differentiated?? 
///////////////////////////// - DhopFaceTime=-usecond(); - st.HaloExchangeOptGather(in,compressor); - DhopFaceTime+=usecond(); - // std::cout << GridLogDslash<< " Dhop Gather end "<< DhopFaceTime<<" us " < > requests; + auto id=traceStart("Communicate overlapped"); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// - DhopFaceTime=-usecond(); - st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms - DhopFaceTime+=usecond(); - // std::cout << GridLogDslash<< " Dhop Commsmerge end "<::DhopInternalSerialComms(StencilImpl & st, LebesgueOr const FermionField &in, FermionField &out,int dag) { + GRID_TRACE("DhopInternalSerialComms"); Compressor compressor(dag); int LLs = in.Grid()->_rdimensions[0]; - // std::cout << GridLogDslash<< " Dhop Halo exchange begine " < void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int dag) { - DhopCalls++; conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid conformable(in.Grid(),out.Grid()); // drops the cb check @@ -505,7 +406,6 @@ void WilsonFermion5D::DhopOE(const FermionField &in, FermionField &out,int template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) { - DhopCalls++; conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid conformable(in.Grid(),out.Grid()); // drops the cb check @@ -517,7 +417,6 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { - DhopCalls+=2; conformable(in.Grid(),FermionGrid()); // verifies full grid conformable(in.Grid(),out.Grid()); @@ -572,12 +471,17 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const LatComplex sk(_grid); sk = Zero(); LatComplex sk2(_grid); sk2= Zero(); LatComplex W(_grid); W= Zero(); - LatComplex a(_grid); a= Zero(); LatComplex one (_grid); one = ScalComplex(1.0,0.0); 
LatComplex cosha(_grid); LatComplex kmu(_grid); LatComplex Wea(_grid); LatComplex Wema(_grid); + LatComplex ea(_grid); + LatComplex ema(_grid); + LatComplex eaLs(_grid); + LatComplex emaLs(_grid); + LatComplex ea2Ls(_grid); + LatComplex ema2Ls(_grid); LatComplex sinha(_grid); LatComplex sinhaLs(_grid); LatComplex coshaLs(_grid); @@ -612,39 +516,29 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const //////////////////////////////////////////// cosha = (one + W*W + sk) / (abs(W)*2.0); - // FIXME Need a Lattice acosh - - { - autoView(cosha_v,cosha,CpuRead); - autoView(a_v,a,CpuWrite); - for(int idx=0;idx<_grid->lSites();idx++){ - Coordinate lcoor(Nd); - Tcomplex cc; - // RealD sgn; - _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha_v,lcoor); - assert((double)real(cc)>=1.0); - assert(fabs((double)imag(cc))<=1.0e-15); - cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a_v,lcoor); - } - } - - Wea = ( exp( a) * abs(W) ); - Wema= ( exp(-a) * abs(W) ); - sinha = 0.5*(exp( a) - exp(-a)); - sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls)); - coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls)); + ea = (cosha + sqrt(cosha*cosha-one)); + ema= (cosha - sqrt(cosha*cosha-one)); + eaLs = pow(ea,Ls); + emaLs= pow(ema,Ls); + ea2Ls = pow(ea,2.0*Ls); + ema2Ls= pow(ema,2.0*Ls); + Wea= abs(W) * ea; + Wema= abs(W) * ema; + // a=log(ea); + + sinha = 0.5*(ea - ema); + sinhaLs = 0.5*(eaLs-emaLs); + coshaLs = 0.5*(eaLs+emaLs); A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0); - F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass); - F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass); + F = eaLs * (one - Wea + (Wema - one) * mass*mass); + F = F + emaLs * (Wema - one + (one - Wea) * mass*mass); F = F - abs(W) * sinha * 4.0 * mass; - Bpp = (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one); - Bmm = (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one); - App = (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * 
(exp(-a) - abs(W)) * (one - mass*mass * one); - Amm = (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one); + Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one); + Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one); + App = (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one); + Amm = (A/F) * (one - ea2Ls) * ea * (ea - abs(W)) * (one - mass*mass * one); ABpm = (A/F) * abs(W) * sinha * 2.0 * (one + mass * coshaLs * 2.0 + mass*mass * one); //P+ source, P- source @@ -667,29 +561,29 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const buf1_4d = Zero(); ExtractSlice(buf1_4d, PRsource, (tt-1), 0); //G(s,t) - bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d; + bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d; //A++*exp(a(s+t)) - bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ; + bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ; //A+-*exp(a(s-t)) - bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ; + bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ; //A-+*exp(a(-s+t)) - bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ; + bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ; //A--*exp(a(-s-t)) - bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ; + bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ; //GL buf2_4d = Zero(); ExtractSlice(buf2_4d, PLsource, (tt-1), 0); //G(s,t) - bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d; + bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d; //B++*exp(a(s+t)) - bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ; + bufL_4d = bufL_4d + Bpp 
* pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ; //B+-*exp(a(s-t)) - bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ; + bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ; //B-+*exp(a(-s+t)) - bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ; + bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ; //B--*exp(a(-s-t)) - bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ; + bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ; } InsertSlice(bufR_4d, GR, (ss-1), 0); InsertSlice(bufL_4d, GL, (ss-1), 0); @@ -808,28 +702,12 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe W = one - M5 + sk2; //////////////////////////////////////////// - // Cosh alpha -> alpha + // Cosh alpha -> exp(+/- alpha) //////////////////////////////////////////// cosha = (one + W*W + sk) / (abs(W)*2.0); - // FIXME Need a Lattice acosh - { - autoView(cosha_v,cosha,CpuRead); - autoView(a_v,a,CpuWrite); - for(int idx=0;idx<_grid->lSites();idx++){ - Coordinate lcoor(Nd); - Tcomplex cc; - // RealD sgn; - _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha_v,lcoor); - assert((double)real(cc)>=1.0); - assert(fabs((double)imag(cc))<=1.0e-15); - cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a_v,lcoor); - }} - - Wea = ( exp( a) * abs(W) ); - Wema= ( exp(-a) * abs(W) ); + Wea = abs(W)*(cosha + sqrt(cosha*cosha-one)); + Wema= abs(W)*(cosha - sqrt(cosha*cosha-one)); num = num + ( one - Wema ) * mass * in; denom= ( Wea - one ) + mass*mass * (one - Wema); From bb049847d5506829f63951fdb812cc4db930dda7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:19:02 -0400 Subject: [PATCH 124/240] Tracing replaces self timing --- .../WilsonFermionImplementation.h | 144 ++++-------------- 1 file changed, 26 insertions(+), 118 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h 
b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index c958019d..2833fdc4 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -76,91 +76,6 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilOdd.BuildSurfaceList(1,vol4); } -template -void WilsonFermion::Report(void) -{ - RealD NP = _grid->_Nprocessors; - RealD NN = _grid->NodeCount(); - RealD volume = 1; - Coordinate latt = _grid->GlobalDimensions(); - for(int mu=0;mu 0 ) { - std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; - std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; - std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; - std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; - - // Average the compute time - _grid->GlobalSum(DhopComputeTime); - DhopComputeTime/=NP; - RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; - - RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting - std::cout << GridLogMessage << "Average mflops/s per call (full) : " 
<< Fullmflops << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; - std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; - - } - - if ( DerivCalls > 0 ) { - std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; - std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " < 0 || DhopCalls > 0){ - std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ - std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < -void WilsonFermion::ZeroCounters(void) { - DhopCalls = 0; // ok - DhopCommTime = 0; - DhopComputeTime = 0; - DhopComputeTime2= 0; - DhopFaceTime = 0; - DhopTotalTime = 0; - - DerivCalls = 0; // ok - DerivCommTime = 0; - DerivComputeTime = 0; - DerivDhopComputeTime = 0; - - Stencil.ZeroCounters(); - StencilEven.ZeroCounters(); - StencilOdd.ZeroCounters(); - Stencil.ZeroCountersi(); - StencilEven.ZeroCountersi(); - StencilOdd.ZeroCountersi(); -} - - template void WilsonFermion::ImportGauge(const GaugeField &_Umu) { @@ -320,7 +235,6 @@ template void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { - DerivCalls++; assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -329,11 +243,8 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; - DerivCommTime-=usecond(); st.HaloExchange(B, compressor); - DerivCommTime+=usecond(); - DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -341,7 +252,6 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; - DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, 
B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -349,9 +259,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); - DerivDhopComputeTime += usecond(); } - DerivComputeTime += usecond(); } template @@ -398,7 +306,6 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { - DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -410,7 +317,6 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { - DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -423,7 +329,6 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { - DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -488,14 +393,12 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, const FermionField &in, FermionField &out, int dag) { - DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else #endif DhopInternalSerial(st,lo,U,in,out,dag); - DhopTotalTime+=usecond(); } template @@ -504,6 +407,7 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO const FermionField &in, FermionField &out, int dag) { + GRID_TRACE("DhopOverlapped"); assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -514,53 +418,55 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl 
&st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); - DhopFaceTime-=usecond(); - st.HaloGather(in,compressor); - DhopFaceTime+=usecond(); + { + GRID_TRACE("Gather"); + st.HaloGather(in,compressor); + } - DhopCommTime -=usecond(); + tracePush("Communication"); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// - DhopFaceTime-=usecond(); - st.CommsMergeSHM(compressor); - DhopFaceTime+=usecond(); + { + GRID_TRACE("MergeSHM"); + st.CommsMergeSHM(compressor); + } ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; - DhopComputeTime-=usecond(); if (dag == DaggerYes) { + GRID_TRACE("DhopDagInterior"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { + GRID_TRACE("DhopInterior"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } - DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); - DhopCommTime +=usecond(); - - DhopFaceTime-=usecond(); - st.CommsMerge(compressor); - DhopFaceTime+=usecond(); + tracePop("Communication"); + { + GRID_TRACE("Merge"); + st.CommsMerge(compressor); + } ///////////////////////////// // do the compute exterior ///////////////////////////// - DhopComputeTime2-=usecond(); if (dag == DaggerYes) { + GRID_TRACE("DhopDagExterior"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { + GRID_TRACE("DhopExterior"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } - DhopComputeTime2+=usecond(); }; @@ -570,20 +476,22 @@ void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, const FermionField &in, FermionField &out, int dag) { + GRID_TRACE("DhopSerial"); assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); - DhopCommTime-=usecond(); - st.HaloExchange(in, compressor); - 
DhopCommTime+=usecond(); + { + GRID_TRACE("HaloExchange"); + st.HaloExchange(in, compressor); + } - DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { + GRID_TRACE("DhopDag"); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { + GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } - DhopComputeTime+=usecond(); }; /*Change ends */ From b4f41309015b141cb3b64e8d8df6f69e1add38df Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:20:21 -0400 Subject: [PATCH 125/240] Defer SMP node links until after interior. Allows for DMA overlapping compute --- .../implementation/WilsonKernelsImplementation.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 211edb6c..939fda33 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -72,20 +72,15 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) if (SE->_is_local) { \ int perm= SE->_permute; \ auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ - spProj(chi,tmp); \ - } else if ( st.same_node[Dir] ) { \ - chi = coalescedRead(buf[SE->_offset],lane); \ - } \ - acceleratorSynchronise(); \ - if (SE->_is_local || st.same_node[Dir] ) { \ - Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ - Recon(result, Uchi); \ - } \ + spProj(chi,tmp); \ + Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ + Recon(result, Uchi); \ + } \ acceleratorSynchronise(); #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ - if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + if (!SE->_is_local ) { \ auto chi = coalescedRead(buf[SE->_offset],lane); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ 
Recon(result, Uchi); \ From cdb8fcc269932a5513cd5df96c09521f56bb2df6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:21:33 -0400 Subject: [PATCH 126/240] Width=4 support. This is too broad; hit it on physical point run. Need to change strategy, I think. --- Grid/qcd/action/filters/DDHMCFilter.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Grid/qcd/action/filters/DDHMCFilter.h b/Grid/qcd/action/filters/DDHMCFilter.h index 366b18e8..f2ea358e 100644 --- a/Grid/qcd/action/filters/DDHMCFilter.h +++ b/Grid/qcd/action/filters/DDHMCFilter.h @@ -91,6 +91,19 @@ struct DDHMCFilter: public MomentumFilterBase U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu); PokeIndex(U, U_mu, mu); } + if ( Width==4) { + U = where(mod(coor,B1)==Integer(B1-4),zzz,U); + U = where(mod(coor,B1)==Integer(B1-3),zzz,U); + U = where(mod(coor,B1)==Integer(B1-2),zzz,U); + U = where(mod(coor,B1)==Integer(B1-1),zzz,U); + U = where(mod(coor,B1)==Integer(0) ,zzz,U); + U = where(mod(coor,B1)==Integer(1) ,zzz,U); + U = where(mod(coor,B1)==Integer(2) ,zzz,U); + U = where(mod(coor,B1)==Integer(3) ,zzz,U); + auto U_mu = PeekIndex(U,mu); + U_mu = where(mod(coor,B1)==Integer(B1-5),zzz_mu,U_mu); + PokeIndex(U, U_mu, mu); + } } } From cf727997357de382780131c5b21106e430931012 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:24:11 -0400 Subject: [PATCH 127/240] Better action naming --- Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h index e93c2f08..9379fae5 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h @@ -85,7 +85,12 @@ NAMESPACE_BEGIN(Grid); PowerNegQuarter.Init(remez,param.tolerance,true); }; - virtual std::string action_name(){return 
"OneFlavourRatioRationalPseudoFermionAction";} + virtual std::string action_name(){ + std::stringstream sstream; + sstream<<"OneFlavourRatioRationalPseudoFermionAction(" + < Date: Wed, 31 Aug 2022 17:32:21 -0400 Subject: [PATCH 128/240] Force reporting improved --- Grid/qcd/hmc/integrators/Integrator.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 33a77f32..7f676ce7 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -143,9 +143,10 @@ protected: force = FieldImplementation::projectForce(force); // Ta for gauge fields double end_force = usecond(); + // DumpSliceNorm("force ",force,Nd-1); MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; @@ -286,7 +287,7 @@ public: <<" force max " << as[level].actions.at(actionID)->deriv_max_average() <<" norm " << as[level].actions.at(actionID)->deriv_norm_average() <<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average() - <<" norm " << as[level].actions.at(actionID)->Fdt_norm_average() + <<" Fdt norm " << as[level].actions.at(actionID)->Fdt_norm_average() <<" calls " << as[level].actions.at(actionID)->deriv_num << std::endl; } From 5abb19eab090c54064b50306b99be2625b6e6a8a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:32:49 -0400 Subject: [PATCH 129/240] Remove self timing --- Grid/stencil/Stencil.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 5b89539e..19b5e6ea 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1251,10 +1251,6 @@ public: return 0; } - void ZeroCounters(void) { }; - - void Report(void) { }; - }; NAMESPACE_END(Grid); From cd5cf6d6143deaf0a3f973a792a6ea7cd0c24dd0 Mon 
Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:33:41 -0400 Subject: [PATCH 130/240] Tracing replaces self timing hooks --- benchmarks/Benchmark_ITT.cc | 2 -- benchmarks/Benchmark_dwf.cc | 4 ---- benchmarks/Benchmark_gparity.cc | 4 ---- benchmarks/Benchmark_mooee.cc | 6 ------ benchmarks/Benchmark_wilson.cc | 2 -- 5 files changed, 18 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2f7acb56..2b1f6261 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -420,7 +420,6 @@ public: FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); @@ -589,7 +588,6 @@ public: FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index c6814cdc..8e1a1028 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -186,7 +186,6 @@ int main (int argc, char ** argv) if (1) { FGrid->Barrier(); - Dw.ZeroCounters(); Dw.Dhop(src,result,0); std::cout<Barrier(); Dw.DhopEO(src_o,r_e,DaggerNo); double t0=usecond(); @@ -328,7 +325,6 @@ int main (int argc, char ** argv) std::cout<Barrier(); - Dw.ZeroCounters(); Dw.Dhop(src,result,0); std::cout<Barrier(); \ - zDw.CayleyZeroCounters(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ - zDw.CayleyReport(); \ std::cout<Barrier(); \ - Dw.CayleyZeroCounters(); \ t0=usecond(); \ for(int i=0;iBarrier(); \ - Dw.CayleyReport(); \ std::cout< Date: Wed, 31 Aug 2022 17:34:09 -0400 Subject: [PATCH 131/240] RocTX, NVTX, text based self profiling --- configure.ac | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/configure.ac b/configure.ac index 528f0125..b26ccc9b 100644 --- a/configure.ac +++ b/configure.ac @@ -128,6 +128,26 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### 
tracing +AC_ARG_ENABLE([tracing], + [AC_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer], [enable tracing])], + [ac_TRACING=${enable_tracing}], [ac_TRACING=none]) + +case ${ac_TRACING} in + nvtx) + AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX]) + LIBS="${LIBS} -lnvToolsExt64_1" + ;; + roctx) + AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX]) + LIBS="${LIBS} -lroctx64" + ;; + timer) + AC_DEFINE([GRID_TRACING_TIMER],[1],[use TIMER]);; + *) + AC_DEFINE([GRID_TRACING_NONE],[1],[no tracing]);; +esac + ############### fermions AC_ARG_ENABLE([fermion-reps], [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])], From 66177bfbe2156f880f99c2935c138b3a3259df95 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 31 Aug 2022 17:35:07 -0400 Subject: [PATCH 132/240] Used in g-2 sign off --- examples/Example_taku1.cc | 479 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 examples/Example_taku1.cc diff --git a/examples/Example_taku1.cc b/examples/Example_taku1.cc new file mode 100644 index 00000000..b3b01d6f --- /dev/null +++ b/examples/Example_taku1.cc @@ -0,0 +1,479 @@ +/* + * Warning: This code illustrative only: not well tested, and not meant for production use + * without regression / tests being applied + */ + +#include + +using namespace std; +using namespace Grid; + +RealD LLscale =1.0; +RealD LCscale =1.0; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // 
Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = source.Grid(); + GridBase *FGrid = D.FermionGrid(); + bool fiveD = true; //calculate 5d free propagator + RealD mass = D.Mass(); + LatticeFermion src4 (UGrid); + LatticeFermion result4 (UGrid); + LatticeFermion result5(FGrid); + LatticeFermion src5(FGrid); + LatticePropagator prop5(FGrid); + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + D.FreePropagator(src5,result5,mass,true); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + + LatticePropagator Vector_mu(UGrid); + LatticeComplex VV (UGrid); + std::vector sumVV; + Gamma::Algebra GammaV[3] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ + }; + for( int mu=0;mu<3;mu++ ) { + Gamma gV(GammaV[mu]); + 
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); + VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current + sliceSum(VV,sumVV,Tdir); + int Nt = sumVV.size(); + for(int t=0;t +void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + bool fiveD = false; //calculate 4d free propagator + RealD mass = D.Mass(); + GridBase *UGrid = source.Grid(); + LatticeFermion src4 (UGrid); + LatticeFermion result4 (UGrid); + for(int s=0;s(src4,source,s,c); + D.FreePropagator(src4,result4,mass,false); + FermToProp(propagator,result4,s,c); + } + } +} + +template +void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + LatticePropagator prop5(FGrid); + + ConjugateGradient CG(1.0e-10,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + LatticePropagator Axial_mu(UGrid); + LatticePropagator Vector_mu(UGrid); + + LatticeComplex PA (UGrid); + LatticeComplex VV (UGrid); + LatticeComplex PJ5q(UGrid); + LatticeComplex PP (UGrid); + + std::vector sumPA; + std::vector sumVV; + std::vector sumPP; + std::vector sumPJ5q; + + Gamma g5(Gamma::Algebra::Gamma5); + D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + sliceSum(PA,sumPA,Tdir); + + int Nt{static_cast(sumPA.size())}; + + for(int t=0;t >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int 
nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, + {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, + {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, + {Gamma::Algebra::Identity,Gamma::Algebra::Identity} + }; + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + RealD M5=atof(getenv("M5")); + RealD mq = atof(getenv("mass")); + int tadpole = atof(getenv("tadpole")); + std::vector masses({ mq} ); // u/d, s, c ?? + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + config="ColdConfig"; + // RealD P=1.0; // Don't scale + // RealD P=0.6388238 // 32Ifine + // RealD P=0.6153342; // 64I + RealD P=0.5871119; // 48I + RealD u0 = sqrt(sqrt(P)); + RealD w0 = 1 - M5; + std::cout< boundary = {1,1,1,-1}; + FermionActionR::ImplParams Params(boundary); + RealD b=1.5; + RealD c=0.5; + std::cout< PointProps(nmass,UGrid); + // std::vector FreeProps(nmass,UGrid); + // LatticePropagator delta(UGrid); + + for(int m=0;m Date: Wed, 31 Aug 2022 17:35:32 -0400 Subject: [PATCH 133/240] Used in g-2 sign off --- examples/Example_christoph.cc | 436 ++++++++++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100644 examples/Example_christoph.cc diff --git a/examples/Example_christoph.cc b/examples/Example_christoph.cc new file mode 100644 index 00000000..d60c5f98 --- /dev/null +++ b/examples/Example_christoph.cc @@ -0,0 +1,436 @@ +/* + * Warning: This code illustrative only: not well tested, and not meant for production use + * without regression / tests being applied + */ + +#include + +using namespace std; +using namespace Grid; + +RealD LLscale =1.0; +RealD LCscale =1.0; + 
+template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = source.Grid(); + GridBase *FGrid = D.FermionGrid(); + bool fiveD = true; //calculate 5d free propagator + RealD mass = D.Mass(); + LatticeFermion src4 (UGrid); + LatticeFermion result4 (UGrid); + LatticeFermion 
result5(FGrid); + LatticeFermion src5(FGrid); + LatticePropagator prop5(FGrid); + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + D.FreePropagator(src5,result5,mass,true); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + + LatticePropagator Vector_mu(UGrid); + LatticeComplex VV (UGrid); + std::vector sumVV; + Gamma::Algebra GammaV[3] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ + }; + for( int mu=0;mu<3;mu++ ) { + Gamma gV(GammaV[mu]); + D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); + VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current + sliceSum(VV,sumVV,Tdir); + int Nt = sumVV.size(); + for(int t=0;t +void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + bool fiveD = false; //calculate 4d free propagator + RealD mass = D.Mass(); + GridBase *UGrid = source.Grid(); + LatticeFermion src4 (UGrid); + LatticeFermion result4 (UGrid); + for(int s=0;s(src4,source,s,c); + D.FreePropagator(src4,result4,mass,false); + FermToProp(propagator,result4,s,c); + } + } +} + +template +void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + LatticePropagator prop5(FGrid); + + ConjugateGradient CG(1.0e-7,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + LatticePropagator Axial_mu(UGrid); + LatticePropagator Vector_mu(UGrid); + + LatticeComplex PA (UGrid); + LatticeComplex VV (UGrid); + 
LatticeComplex PJ5q(UGrid); + LatticeComplex PP (UGrid); + + std::vector sumPA; + std::vector sumVV; + std::vector sumPP; + std::vector sumPJ5q; + + Gamma g5(Gamma::Algebra::Gamma5); + D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + sliceSum(PA,sumPA,Tdir); + + int Nt{static_cast(sumPA.size())}; + + for(int t=0;t >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, + {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, + {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, + {Gamma::Algebra::Identity,Gamma::Algebra::Identity} + }; + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + RealD M5=atof(getenv("M5")); + RealD mq = atof(getenv("mass")); + int point_x = atoi(getenv("point_x")); + int point_y = atoi(getenv("point_y")); + int point_z = atoi(getenv("point_z")); + int point_t = atoi(getenv("point_t")); + std::vector masses({ mq} ); // u/d, s, c ?? 
+ if( argc > 1 && argv[1][0] != '-' ) + { + std::cout< FermActs; + + std::cout< boundary = {1,1,1,-1}; + FermionActionR::ImplParams Params(boundary); + RealD b=1.5; + RealD c=0.5; + FermActs.push_back(new FermionActionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c)); + } + + LatticePropagator point_source(UGrid); + + Coordinate Origin({point_x,point_y,point_z,point_t}); + PointSource (Origin,point_source); + + std::vector PointProps(nmass,UGrid); + + for(int m=0;m Date: Wed, 31 Aug 2022 18:31:46 -0400 Subject: [PATCH 134/240] Tracing --- Grid/perfmon/Tracing.h | 66 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Grid/perfmon/Tracing.h diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h new file mode 100644 index 00000000..d39350c9 --- /dev/null +++ b/Grid/perfmon/Tracing.h @@ -0,0 +1,66 @@ +#pragma once +#ifdef GRID_TRACING_NVTX +#include +class GridTracer { +public: + GridTracer(const char* name) { + nvtxRangePushA(name); + } + ~GridTracer() { + nvtxRangePop(); + } +}; +inline void tracePush(const char *name) { nvtxRangePushA(name); } +inline void tracePop(const char *name) { nvtxRangePop(); } +inline int traceStart(const char *name) { } +inline void traceStop(int ID) { } +#endif + +#ifdef GRID_TRACING_ROCTX +#include +class GridTracer { + public: + GridTracer(const char* name) { + roctxRangePushA(name); + std::cout << "roctxRangePush "< Date: Wed, 31 Aug 2022 19:01:14 -0400 Subject: [PATCH 135/240] Warning fixes --- Grid/allocator/MemoryManagerCache.cc | 14 +++++++------- Grid/qcd/action/filters/MomentumFilter.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index f6d07582..3bb3db7e 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -110,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) 
/////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -118,7 +118,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -132,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -143,7 +143,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -156,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void 
*)AccCache.CpuPtr,AccCache.bytes); - mprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -171,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - mprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -247,7 +247,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, diff --git a/Grid/qcd/action/filters/MomentumFilter.h b/Grid/qcd/action/filters/MomentumFilter.h index 864166f5..275f2c9c 100644 --- a/Grid/qcd/action/filters/MomentumFilter.h +++ b/Grid/qcd/action/filters/MomentumFilter.h @@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid); template struct MomentumFilterBase{ virtual void applyFilter(MomentaField &P) const = 0; + virtual ~MomentumFilterBase(){}; }; //Do nothing @@ -83,7 +84,6 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase{ } - }; From 1713de35c0dc339564661dd7df8a72583f889e91 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 5 Sep 2022 21:50:02 -0400 Subject: [PATCH 136/240] Improved config flags --- systems/Crusher/config-command | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index c93ea9c8..dfd8d127 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -7,7 +7,7 @@ --disable-fermion-reps \ --with-gmp=$OLCF_GMP_ROOT \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ ---disable-gparity \ +--enable-gparity \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include " \ LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " From 19da647e3c54a38ff65ea123fbb3aa5981dba2b3 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Fri, 9 Sep 2022 12:47:09 -0400 Subject: [PATCH 137/240] Added support for non-periodic gauge field implementations in the random gauge shift performed at the start of the HMC trajectory (The above required exposing the gauge implementation to the HMC class through the Integrator class) Made the random shift optional (default on) through a parameter in HMCparameters Modified ConjugateBC::CshiftLink such that it supports any shift in -L < shift < L rather than just +-1 Added a tester for the BC-respecting Cshift Fixed a missing system header include in SSE4 intrinsics wrapper Fixed sumD_cpu for single-prec types performing an incorrect conversion to a single-prec data type at the end, that fails to compile on some systems --- Grid/lattice/Lattice_reduction.h | 5 +- Grid/qcd/hmc/HMC.h | 43 ++-- Grid/qcd/hmc/integrators/Integrator.h | 4 +- .../hmc/integrators/Integrator_algorithm.h | 15 +- Grid/qcd/utils/CovariantCshift.h | 30 ++- Grid/qcd/utils/GaugeFix.h | 4 +- Grid/simd/Grid_sse4.h | 2 +- tests/Test_gfield_shift.cc | 183 ++++++++++++++++++ 8 files changed, 248 insertions(+), 38 deletions(-) create mode 100644 tests/Test_gfield_shift.cc diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index bcd09c04..af6f8c00 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ 
-91,10 +91,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) for(int i=0;i * > ObsListType; //pass these from the resource manager @@ -138,26 +142,37 @@ private: GridBase *Grid = U.Grid(); - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Mainly for DDHMC perform a random translation of U modulo volume - ////////////////////////////////////////////////////////////////////////////////////////////////////// - std::cout << GridLogMessage << "--------------------------------------------------\n"; - std::cout << GridLogMessage << "Random shifting gauge field by ["; - for(int d=0;dNd();d++) { + if(Params.PerformRandomShift){ + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Mainly for DDHMC perform a random translation of U modulo volume + ////////////////////////////////////////////////////////////////////////////////////////////////////// + std::cout << GridLogMessage << "--------------------------------------------------\n"; + std::cout << GridLogMessage << "Random shifting gauge field by ["; - int L = Grid->GlobalDimensions()[d]; + std::vector Umu(Grid->Nd(), U.Grid()); + for(int mu=0;muNd();mu++) Umu[mu] = PeekIndex(U, mu); - RealD rn_uniform; random(sRNG, rn_uniform); + for(int d=0;dNd();d++) { - int shift = (int) (rn_uniform*L); + int L = Grid->GlobalDimensions()[d]; - std::cout << shift; - if(dNd()-1) std::cout <<","; - else std::cout <<"]\n"; + RealD rn_uniform; random(sRNG, rn_uniform); + + int shift = (int) (rn_uniform*L); + + std::cout << shift; + if(dNd()-1) std::cout <<","; + else std::cout <<"]\n"; - U = Cshift(U,d,shift); + //shift all fields together in a way that respects the gauge BCs + for(int mu=0; mu < Grid->Nd(); mu++) + Umu[mu] = FieldImplementation::CshiftLink(Umu[mu],d,shift); + } + + for(int mu=0;muNd();mu++) PokeIndex(U,Umu[mu],mu); + + std::cout << GridLogMessage << 
"--------------------------------------------------\n"; } - std::cout << GridLogMessage << "--------------------------------------------------\n"; TheIntegrator.reset_timer(); diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 7f676ce7..f36e1f64 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -63,10 +63,10 @@ public: }; /*! @brief Class for Molecular Dynamics management */ -template +template class Integrator { protected: - + typedef FieldImplementation_ FieldImplementation; typedef typename FieldImplementation::Field MomentaField; //for readability typedef typename FieldImplementation::Field Field; diff --git a/Grid/qcd/hmc/integrators/Integrator_algorithm.h b/Grid/qcd/hmc/integrators/Integrator_algorithm.h index b05c4ea8..d8b34571 100644 --- a/Grid/qcd/hmc/integrators/Integrator_algorithm.h +++ b/Grid/qcd/hmc/integrators/Integrator_algorithm.h @@ -92,10 +92,11 @@ NAMESPACE_BEGIN(Grid); * P 1/2 P 1/2 */ -template > -class LeapFrog : public Integrator +template > +class LeapFrog : public Integrator { public: + typedef FieldImplementation_ FieldImplementation; typedef LeapFrog Algorithm; INHERIT_FIELD_TYPES(FieldImplementation); @@ -135,13 +136,14 @@ public: } }; -template > -class MinimumNorm2 : public Integrator +template > +class MinimumNorm2 : public Integrator { private: const RealD lambda = 0.1931833275037836; public: + typedef FieldImplementation_ FieldImplementation; INHERIT_FIELD_TYPES(FieldImplementation); MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet& Aset, SmearingPolicy& Sm) @@ -192,8 +194,8 @@ public: } }; -template > -class ForceGradient : public Integrator +template > +class ForceGradient : public Integrator { private: const RealD lambda = 1.0 / 6.0; @@ -202,6 +204,7 @@ private: const RealD theta = 0.0; public: + typedef FieldImplementation_ FieldImplementation; INHERIT_FIELD_TYPES(FieldImplementation); // Looks like dH scales as dt^4. 
tested wilson/wilson 2 level. diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 79cf8e0f..23984145 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -227,26 +227,38 @@ namespace ConjugateBC { //shift = -1 //Out(x) = U_\mu(x-mu) | x_\mu != 0 // = U*_\mu(L-1) | x_\mu == 0 + //shift = 2 + //Out(x) = U_\mu(x+2\hat\mu) | x_\mu < L-2 + // = U*_\mu(1) | x_\mu == L-1 + // = U*_\mu(0) | x_\mu == L-2 + //shift = -2 + //Out(x) = U_\mu(x-2mu) | x_\mu > 1 + // = U*_\mu(L-2) | x_\mu == 0 + // = U*_\mu(L-1) | x_\mu == 1 + //etc template Lattice CshiftLink(const Lattice &Link, int mu, int shift) { GridBase *grid = Link.Grid(); - int Lmu = grid->GlobalDimensions()[mu] - 1; + int Lmu = grid->GlobalDimensions()[mu]; + assert(abs(shift) < Lmu && "Invalid shift value"); Lattice> coor(grid); LatticeCoordinate(coor, mu); Lattice tmp(grid); - if(shift == 1){ - tmp = Cshift(Link, mu, 1); - tmp = where(coor == Lmu, conjugate(tmp), tmp); + if(shift > 0){ + tmp = Cshift(Link, mu, shift); + tmp = where(coor >= Lmu-shift, conjugate(tmp), tmp); return tmp; - }else if(shift == -1){ + }else if(shift < 0){ tmp = Link; - tmp = where(coor == Lmu, conjugate(tmp), tmp); - return Cshift(tmp, mu, -1); - }else assert(0 && "Invalid shift value"); - return tmp; //shuts up the compiler fussing about the return type + tmp = where(coor >= Lmu+shift, conjugate(tmp), tmp); + return Cshift(tmp, mu, shift); + } + + //shift == 0 + return Link; } } diff --git a/Grid/qcd/utils/GaugeFix.h b/Grid/qcd/utils/GaugeFix.h index d9d03c54..fc723fe3 100644 --- a/Grid/qcd/utils/GaugeFix.h +++ b/Grid/qcd/utils/GaugeFix.h @@ -72,12 +72,12 @@ public: //Fix the gauge field Umu //0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf - static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) { + static void 
SteepestDescentGaugeFix(GaugeLorentz &Umu,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) { GridBase *grid = Umu.Grid(); GaugeMat xform(grid); SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge); } - static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) { + static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) { //Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform GridBase *grid = Umu.Grid(); diff --git a/Grid/simd/Grid_sse4.h b/Grid/simd/Grid_sse4.h index eb76427e..2b2a80fd 100644 --- a/Grid/simd/Grid_sse4.h +++ b/Grid/simd/Grid_sse4.h @@ -35,7 +35,7 @@ Author: neo */ // Time-stamp: <2015-06-16 23:27:54 neo> //---------------------------------------------------------------------- - +#include #include NAMESPACE_BEGIN(Grid); diff --git a/tests/Test_gfield_shift.cc b/tests/Test_gfield_shift.cc new file mode 100644 index 00000000..cf450d3c --- /dev/null +++ b/tests/Test_gfield_shift.cc @@ -0,0 +1,183 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_gfield_shift.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +//Test the shifting of the gauge field that respects the boundary conditions +#include + +using namespace Grid; + ; + +typedef ConjugateGimplR Gimpl; //can choose periodic / charge conjugate directions at wil +typedef Gimpl::GaugeField GaugeField; +typedef Gimpl::GaugeLinkField GaugeLinkField; +typedef Gimpl::SiteGaugeField SiteGaugeField; +typedef Gimpl::SiteGaugeLink SiteGaugeLink; + +GaugeField CshiftGaugeField(const GaugeField &U, const int dir, const int shift){ + GridBase *Grid = U.Grid(); + + GaugeField out(Grid); + GaugeLinkField Umu(Grid); + for(int mu=0;muNd();mu++){ + Umu = PeekIndex(U, mu); + Umu = Gimpl::CshiftLink(Umu,dir,shift); + PokeIndex(out,Umu,mu); + } + return out; +} + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + std::vector conj_dirs = {1,1,0,0}; + Gimpl::setDirections(conj_dirs); + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + + GridParallelRNG FineRNG(&Fine); FineRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + GaugeField U(&Fine); + GaugeField ShiftU(&Fine); + + GaugeLinkField link_field(&Fine), link_field_2(&Fine); + + //Like Test_cshift we put the lex coordinate index on each site but make it imaginary + //so we can tell 
when it was complex conjugated + LatticeComplex lex(&Fine); + lex=Zero(); + U = Zero(); + { + LatticeComplex coor(&Fine); + Integer stride =1; + for(int d=0;d<4;d++){ + LatticeCoordinate(coor,d); + lex = lex + coor*stride; + stride=stride*latt_size[d]; + } + PokeIndex(link_field, lex, 0,0); //place on 0,0 element of link + + for(int mu=0;mu(U, link_field_2, mu); + } + } + + std::stringstream ss; + ss<<"error"; + for(int d=0;d 0 && coor[dir] >= latt_size[dir]-shift && conj_dirs[dir] ) + || + ( shift < 0 && coor[dir] <= -shift-1 && conj_dirs[dir] ) + ) + scm = conjugate(scm); //CC if pulled over boundary + + cm = um(mu)()(0,0); + + RealD nrm = abs(scm-cm()()()); + //std::cout << cm << " " << scm << std::endl; + + Coordinate peer(4); + Complex tmp =cm; + Integer index=real(tmp); + + Integer cm_mu = index / vol4d; + index = index % vol4d; + Lexicographic::CoorFromIndex(peer,index,latt_size); + + if (nrm > 0){ + ferr<<"FAIL mu " << mu << " shift "<< shift<<" in dir "<< dir<<" ["< Date: Fri, 23 Sep 2022 16:18:47 -0400 Subject: [PATCH 138/240] MixedPrec Multishift with better precision scheme for GPU --- .../ConjugateGradientMultiShiftMixedPrec.h | 52 +++++++------------ 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index b1d90688..8a88cb07 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -164,12 +164,8 @@ public: RealD cp,bp,qq; //prev // Matrix mult fields - FieldF r_f(SinglePrecGrid); FieldF p_f(SinglePrecGrid); - FieldF tmp_f(SinglePrecGrid); FieldF mmp_f(SinglePrecGrid); - FieldF src_f(SinglePrecGrid); - precisionChange(src_f, src_d); // Check lightest mass for(int s=0;s &_NumOp, FermionOperator &_DenOp, const Params & p @@ -149,6 +197,11 @@ NAMESPACE_BEGIN(Grid); ApproxNegHalfPowerMD.tolerances[i] = 
ApproxHalfPowerMD.tolerances[i] = param.md_tolerance; } + std::vector action_tolerance(ApproxHalfPowerAction.tolerances.size(),param.action_tolerance); + std::vector md_tolerance (ApproxHalfPowerMD.tolerances.size(),param.md_tolerance); + + SetTolerances(action_tolerance, md_tolerance); + std::cout< Date: Fri, 23 Sep 2022 16:22:28 -0400 Subject: [PATCH 142/240] dt info --- Grid/qcd/hmc/integrators/Integrator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 7f676ce7..48219c04 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -145,7 +145,7 @@ protected: // DumpSliceNorm("force ",force,Nd-1); MomFilter->applyFilter(force); - std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) @@ -156,6 +156,7 @@ protected: as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); + std::cout << GridLogIntegrator<< "["< Date: Fri, 23 Sep 2022 16:22:53 -0400 Subject: [PATCH 143/240] Better logging of Fdt for force gradient --- Grid/qcd/hmc/integrators/Integrator_algorithm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/hmc/integrators/Integrator_algorithm.h b/Grid/qcd/hmc/integrators/Integrator_algorithm.h index b05c4ea8..ad556659 100644 --- a/Grid/qcd/hmc/integrators/Integrator_algorithm.h +++ b/Grid/qcd/hmc/integrators/Integrator_algorithm.h @@ -227,7 +227,8 @@ public: // Presently 4 force evals, and should have 3, so 1.33x too expensive. // could reduce this with sloppy CG to perhaps 1.15x too expensive // even without prediction. 
- this->update_P(Pfg, Ufg, level, 1.0); + this->update_P(Pfg, Ufg, level, fg_dt); + Pfg = Pfg*(1.0/fg_dt); this->update_U(Pfg, Ufg, fg_dt); this->update_P(Ufg, level, ep); } From a2cefaa53afcd8a5f9e05f8e7639f17bb8bfc20c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 23 Sep 2022 16:49:14 -0400 Subject: [PATCH 144/240] Faster --- systems/Crusher/config-command | 6 ++++-- systems/Crusher/dwf.slurm | 23 +++++++++++++---------- systems/Crusher/sourceme.sh | 8 ++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index dfd8d127..854c2c01 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -6,10 +6,12 @@ --enable-simd=GPU \ --disable-fermion-reps \ --with-gmp=$OLCF_GMP_ROOT \ +--with-fftw=$FFTW_DIR/.. \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --enable-gparity \ CXX=hipcc MPICXX=mpicxx \ -CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include " \ - LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " +CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \ + LDFLAGS="-L/lib64 -L/opt/rocm-5.2.0/lib/ -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " + diff --git a/systems/Crusher/dwf.slurm b/systems/Crusher/dwf.slurm index 757ad4a2..20239e80 100644 --- a/systems/Crusher/dwf.slurm +++ b/systems/Crusher/dwf.slurm @@ -10,23 +10,26 @@ #SBATCH -n 8 #SBATCH --exclusive #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4 +#export BIND="--cpu-bind=verbose,map_ldom:3,3,1,1,2,2,0,0" DIR=. 
source sourceme.sh -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=16384 export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -export OMP_NUM_THREADS=1 +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=4 +export MPICH_OFI_NIC_POLICY=GPU -echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +#PARAMS=" --accelerator-threads 8 --grid 64.64.32.16 --mpi 2.2.2.1 --comms-sequential --shm 2048 --shm-mpi 1" +#srun --gpus-per-task 1 -n8 $BIND ./wrap.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS -echo working directory -pwd +PARAMS=" --accelerator-threads 8 --grid 16.16.32.32 --mpi 1.1.1.1 --comms-sequential --shm 2048 --shm-mpi 1" +srun --gpus-per-task 1 -n1 $BIND ./wrap.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS -PARAMS=" --accelerator-threads 8 --grid 32.32.32.32 --mpi 1.1.1.1 --comms-sequential --shm 2048 --shm-mpi 0" -srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads 8 --grid 32.16.32.32 --mpi 1.1.1.2 --comms-sequential --shm 2048 --shm-mpi 1" +srun --gpus-per-task 1 -n2 $BIND ./wrap.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS -PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.1 --comms-sequential --shm 2048 --shm-mpi 0" -srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads 8 --grid 32.32.32.64 --mpi 1.2.2.2 --comms-sequential --shm 2048 --shm-mpi 1" +srun --gpus-per-task 1 -n8 $BIND ./wrap.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index ad0d6582..3cccb10a 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -1,8 +1,12 @@ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib +module load emacs +#module load gperftools module load PrgEnv-gnu -module load rocm/5.1.0 +module load rocm/5.2.0 module load 
cray-mpich/8.1.16 +#module load cray-mpich/8.1.17 module load gmp -#module load cray-fftw +module load cray-fftw module load craype-accel-amd-gfx90a export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH #Hack for lib From 68e4d833dd4f772f8b33ca12f45126138afea29a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 23 Sep 2022 16:49:29 -0400 Subject: [PATCH 145/240] Run through wrapper script --- systems/Crusher/wrap.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 systems/Crusher/wrap.sh diff --git a/systems/Crusher/wrap.sh b/systems/Crusher/wrap.sh new file mode 100755 index 00000000..eb58353c --- /dev/null +++ b/systems/Crusher/wrap.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES +unset ROCR_VISIBLE_DEVICES + +#rank=$SLURM_PROCID +#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@ + +$@ From 7ffbc3e98e561a2f9696a1839649ac42e9b49450 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:35:31 -0400 Subject: [PATCH 146/240] Double2 improved. 
REally don't like 'convertType' - localise to a GPT header --- Grid/lattice/Lattice_transfer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index de6ca886..cf6e94c2 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { #endif accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { - out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v); + precisionChange(out,in); } accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { - Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); + precisionChange(out,in); } template From 6c9eef97264fb089489e5f26c79822b87b9cabea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:36:54 -0400 Subject: [PATCH 147/240] D2 fields --- Grid/qcd/QCD.h | 85 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 81356a66..533c0e21 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -126,6 +126,7 @@ typedef iSpinMatrix SpinMatrixD; typedef iSpinMatrix vSpinMatrix; typedef iSpinMatrix vSpinMatrixF; typedef iSpinMatrix vSpinMatrixD; +typedef iSpinMatrix vSpinMatrixD2; // Colour Matrix typedef iColourMatrix ColourMatrix; @@ -135,6 +136,7 @@ typedef iColourMatrix ColourMatrixD; typedef iColourMatrix vColourMatrix; typedef iColourMatrix vColourMatrixF; typedef iColourMatrix vColourMatrixD; +typedef iColourMatrix vColourMatrixD2; // SpinColour matrix typedef iSpinColourMatrix SpinColourMatrix; @@ -144,6 +146,7 @@ typedef iSpinColourMatrix SpinColourMatrixD; typedef iSpinColourMatrix vSpinColourMatrix; typedef iSpinColourMatrix vSpinColourMatrixF; typedef iSpinColourMatrix vSpinColourMatrixD; +typedef iSpinColourMatrix 
vSpinColourMatrixD2; // SpinColourSpinColour matrix typedef iSpinColourSpinColourMatrix SpinColourSpinColourMatrix; @@ -153,6 +156,7 @@ typedef iSpinColourSpinColourMatrix SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixD; +typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixD2; // SpinColourSpinColour matrix typedef iSpinColourSpinColourMatrix SpinColourSpinColourMatrix; @@ -162,33 +166,37 @@ typedef iSpinColourSpinColourMatrix SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixD; +typedef iSpinColourSpinColourMatrix vSpinColourSpinColourMatrixD2; // LorentzColour typedef iLorentzColourMatrix LorentzColourMatrix; typedef iLorentzColourMatrix LorentzColourMatrixF; typedef iLorentzColourMatrix LorentzColourMatrixD; -typedef iLorentzColourMatrix vLorentzColourMatrix; -typedef iLorentzColourMatrix vLorentzColourMatrixF; -typedef iLorentzColourMatrix vLorentzColourMatrixD; +typedef iLorentzColourMatrix vLorentzColourMatrix; +typedef iLorentzColourMatrix vLorentzColourMatrixF; +typedef iLorentzColourMatrix vLorentzColourMatrixD; +typedef iLorentzColourMatrix vLorentzColourMatrixD2; // DoubleStored gauge field typedef iDoubleStoredColourMatrix DoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix DoubleStoredColourMatrixD; -typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrix; -typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixF; -typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixD; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrix; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixF; +typedef 
iDoubleStoredColourMatrix vDoubleStoredColourMatrixD; +typedef iDoubleStoredColourMatrix vDoubleStoredColourMatrixD2; //G-parity flavour matrix typedef iGparityFlavourMatrix GparityFlavourMatrix; typedef iGparityFlavourMatrix GparityFlavourMatrixF; typedef iGparityFlavourMatrix GparityFlavourMatrixD; -typedef iGparityFlavourMatrix vGparityFlavourMatrix; -typedef iGparityFlavourMatrix vGparityFlavourMatrixF; -typedef iGparityFlavourMatrix vGparityFlavourMatrixD; +typedef iGparityFlavourMatrix vGparityFlavourMatrix; +typedef iGparityFlavourMatrix vGparityFlavourMatrixF; +typedef iGparityFlavourMatrix vGparityFlavourMatrixD; +typedef iGparityFlavourMatrix vGparityFlavourMatrixD2; // Spin vector @@ -199,6 +207,7 @@ typedef iSpinVector SpinVectorD; typedef iSpinVector vSpinVector; typedef iSpinVector vSpinVectorF; typedef iSpinVector vSpinVectorD; +typedef iSpinVector vSpinVectorD2; // Colour vector typedef iColourVector ColourVector; @@ -208,6 +217,7 @@ typedef iColourVector ColourVectorD; typedef iColourVector vColourVector; typedef iColourVector vColourVectorF; typedef iColourVector vColourVectorD; +typedef iColourVector vColourVectorD2; // SpinColourVector typedef iSpinColourVector SpinColourVector; @@ -217,6 +227,7 @@ typedef iSpinColourVector SpinColourVectorD; typedef iSpinColourVector vSpinColourVector; typedef iSpinColourVector vSpinColourVectorF; typedef iSpinColourVector vSpinColourVectorD; +typedef iSpinColourVector vSpinColourVectorD2; // HalfSpin vector typedef iHalfSpinVector HalfSpinVector; @@ -226,15 +237,17 @@ typedef iHalfSpinVector HalfSpinVectorD; typedef iHalfSpinVector vHalfSpinVector; typedef iHalfSpinVector vHalfSpinVectorF; typedef iHalfSpinVector vHalfSpinVectorD; +typedef iHalfSpinVector vHalfSpinVectorD2; // HalfSpinColour vector typedef iHalfSpinColourVector HalfSpinColourVector; typedef iHalfSpinColourVector HalfSpinColourVectorF; typedef iHalfSpinColourVector HalfSpinColourVectorD; -typedef iHalfSpinColourVector vHalfSpinColourVector; 
-typedef iHalfSpinColourVector vHalfSpinColourVectorF; -typedef iHalfSpinColourVector vHalfSpinColourVectorD; +typedef iHalfSpinColourVector vHalfSpinColourVector; +typedef iHalfSpinColourVector vHalfSpinColourVectorF; +typedef iHalfSpinColourVector vHalfSpinColourVectorD; +typedef iHalfSpinColourVector vHalfSpinColourVectorD2; //G-parity flavour vector typedef iGparityFlavourVector GparityFlavourVector; @@ -244,7 +257,7 @@ typedef iGparityFlavourVector GparityFlavourVectorD; typedef iGparityFlavourVector vGparityFlavourVector; typedef iGparityFlavourVector vGparityFlavourVectorF; typedef iGparityFlavourVector vGparityFlavourVectorD; - +typedef iGparityFlavourVector vGparityFlavourVectorD2; // singlets typedef iSinglet TComplex; // FIXME This is painful. Tensor singlet complex type. @@ -254,6 +267,7 @@ typedef iSinglet TComplexD; // FIXME This is painful. Tenso typedef iSinglet vTComplex ; // what if we don't know the tensor structure typedef iSinglet vTComplexF; // what if we don't know the tensor structure typedef iSinglet vTComplexD; // what if we don't know the tensor structure +typedef iSinglet vTComplexD2; // what if we don't know the tensor structure typedef iSinglet TReal; // Shouldn't need these; can I make it work without? typedef iSinglet TRealF; // Shouldn't need these; can I make it work without? 
@@ -271,47 +285,58 @@ typedef iSinglet TInteger; typedef Lattice LatticeColourMatrix; typedef Lattice LatticeColourMatrixF; typedef Lattice LatticeColourMatrixD; +typedef Lattice LatticeColourMatrixD2; typedef Lattice LatticeSpinMatrix; typedef Lattice LatticeSpinMatrixF; typedef Lattice LatticeSpinMatrixD; +typedef Lattice LatticeSpinMatrixD2; typedef Lattice LatticeSpinColourMatrix; typedef Lattice LatticeSpinColourMatrixF; typedef Lattice LatticeSpinColourMatrixD; +typedef Lattice LatticeSpinColourMatrixD2; typedef Lattice LatticeSpinColourSpinColourMatrix; typedef Lattice LatticeSpinColourSpinColourMatrixF; typedef Lattice LatticeSpinColourSpinColourMatrixD; +typedef Lattice LatticeSpinColourSpinColourMatrixD2; -typedef Lattice LatticeLorentzColourMatrix; -typedef Lattice LatticeLorentzColourMatrixF; -typedef Lattice LatticeLorentzColourMatrixD; +typedef Lattice LatticeLorentzColourMatrix; +typedef Lattice LatticeLorentzColourMatrixF; +typedef Lattice LatticeLorentzColourMatrixD; +typedef Lattice LatticeLorentzColourMatrixD2; // DoubleStored gauge field -typedef Lattice LatticeDoubleStoredColourMatrix; -typedef Lattice LatticeDoubleStoredColourMatrixF; -typedef Lattice LatticeDoubleStoredColourMatrixD; +typedef Lattice LatticeDoubleStoredColourMatrix; +typedef Lattice LatticeDoubleStoredColourMatrixF; +typedef Lattice LatticeDoubleStoredColourMatrixD; +typedef Lattice LatticeDoubleStoredColourMatrixD2; typedef Lattice LatticeSpinVector; typedef Lattice LatticeSpinVectorF; typedef Lattice LatticeSpinVectorD; +typedef Lattice LatticeSpinVectorD2; typedef Lattice LatticeColourVector; typedef Lattice LatticeColourVectorF; typedef Lattice LatticeColourVectorD; +typedef Lattice LatticeColourVectorD2; typedef Lattice LatticeSpinColourVector; typedef Lattice LatticeSpinColourVectorF; typedef Lattice LatticeSpinColourVectorD; +typedef Lattice LatticeSpinColourVectorD2; typedef Lattice LatticeHalfSpinVector; typedef Lattice LatticeHalfSpinVectorF; typedef Lattice 
LatticeHalfSpinVectorD; +typedef Lattice LatticeHalfSpinVectorD2; -typedef Lattice LatticeHalfSpinColourVector; -typedef Lattice LatticeHalfSpinColourVectorF; -typedef Lattice LatticeHalfSpinColourVectorD; +typedef Lattice LatticeHalfSpinColourVector; +typedef Lattice LatticeHalfSpinColourVectorF; +typedef Lattice LatticeHalfSpinColourVectorD; +typedef Lattice LatticeHalfSpinColourVectorD2; typedef Lattice LatticeReal; typedef Lattice LatticeRealF; @@ -320,6 +345,7 @@ typedef Lattice LatticeRealD; typedef Lattice LatticeComplex; typedef Lattice LatticeComplexF; typedef Lattice LatticeComplexD; +typedef Lattice LatticeComplexD2; typedef Lattice LatticeInteger; // Predicates for "where" @@ -327,37 +353,42 @@ typedef Lattice LatticeInteger; // Predicates for "where" /////////////////////////////////////////// // Physical names for things /////////////////////////////////////////// -typedef LatticeHalfSpinColourVector LatticeHalfFermion; -typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; -typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD; +typedef LatticeHalfSpinColourVector LatticeHalfFermion; +typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; +typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD; +typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2; typedef LatticeSpinColourVector LatticeFermion; typedef LatticeSpinColourVectorF LatticeFermionF; typedef LatticeSpinColourVectorD LatticeFermionD; +typedef LatticeSpinColourVectorD2 LatticeFermionD2; typedef LatticeSpinColourMatrix LatticePropagator; typedef LatticeSpinColourMatrixF LatticePropagatorF; typedef LatticeSpinColourMatrixD LatticePropagatorD; +typedef LatticeSpinColourMatrixD2 LatticePropagatorD2; typedef LatticeLorentzColourMatrix LatticeGaugeField; typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; +typedef LatticeLorentzColourMatrixD2 LatticeGaugeFieldD2; typedef LatticeDoubleStoredColourMatrix 
LatticeDoubledGaugeField; typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; +typedef LatticeDoubleStoredColourMatrixD2 LatticeDoubledGaugeFieldD2; template using LorentzScalar = Lattice >; -// Uhgg... typing this hurt ;) -// (my keyboard got burning hot when I typed this, must be the anti-Fermion) typedef Lattice LatticeStaggeredFermion; typedef Lattice LatticeStaggeredFermionF; typedef Lattice LatticeStaggeredFermionD; +typedef Lattice LatticeStaggeredFermionD2; typedef Lattice LatticeStaggeredPropagator; typedef Lattice LatticeStaggeredPropagatorF; typedef Lattice LatticeStaggeredPropagatorD; +typedef Lattice LatticeStaggeredPropagatorD2; ////////////////////////////////////////////////////////////////////////////// // Peek and Poke named after physics attributes From 9e81b42981b34484a75d1f2051b2ff3ffc01c14d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:37:19 -0400 Subject: [PATCH 148/240] D2 fields --- Grid/qcd/action/fermion/CloverHelpers.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index 57e71998..e93f3dd0 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -140,6 +140,7 @@ public: return NMAX; } + static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-6,R);} From e8bfbf2f7c14165cea962d73963281b657f99bd0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:37:45 -0400 Subject: [PATCH 149/240] D2 operators --- Grid/qcd/action/fermion/Fermion.h | 111 +++++++----------------------- 1 file changed, 24 insertions(+), 87 deletions(-) diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 223ff9dd..2f3fb9f2 100644 --- 
a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -112,28 +112,21 @@ NAMESPACE_CHECK(DWFutils); // Cayley 5d NAMESPACE_BEGIN(Grid); -typedef WilsonFermion WilsonFermionR; +typedef WilsonFermion WilsonFermionD2; typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; -//typedef WilsonFermion WilsonFermionRL; -//typedef WilsonFermion WilsonFermionFH; -//typedef WilsonFermion WilsonFermionDF; - -typedef WilsonFermion WilsonAdjFermionR; typedef WilsonFermion WilsonAdjFermionF; typedef WilsonFermion WilsonAdjFermionD; -typedef WilsonFermion WilsonTwoIndexSymmetricFermionR; typedef WilsonFermion WilsonTwoIndexSymmetricFermionF; typedef WilsonFermion WilsonTwoIndexSymmetricFermionD; -typedef WilsonFermion WilsonTwoIndexAntiSymmetricFermionR; typedef WilsonFermion WilsonTwoIndexAntiSymmetricFermionF; typedef WilsonFermion WilsonTwoIndexAntiSymmetricFermionD; // Twisted mass fermion -typedef WilsonTMFermion WilsonTMFermionR; +typedef WilsonTMFermion WilsonTMFermionD2; typedef WilsonTMFermion WilsonTMFermionF; typedef WilsonTMFermion WilsonTMFermionD; @@ -141,23 +134,20 @@ typedef WilsonTMFermion WilsonTMFermionD; template using WilsonClover = WilsonCloverFermion>; template using WilsonExpClover = WilsonCloverFermion>; -typedef WilsonClover WilsonCloverFermionR; +typedef WilsonClover WilsonCloverFermionD2; typedef WilsonClover WilsonCloverFermionF; typedef WilsonClover WilsonCloverFermionD; -typedef WilsonExpClover WilsonExpCloverFermionR; +typedef WilsonExpClover WilsonExpCloverFermionD2; typedef WilsonExpClover WilsonExpCloverFermionF; typedef WilsonExpClover WilsonExpCloverFermionD; -typedef WilsonClover WilsonCloverAdjFermionR; typedef WilsonClover WilsonCloverAdjFermionF; typedef WilsonClover WilsonCloverAdjFermionD; -typedef WilsonClover WilsonCloverTwoIndexSymmetricFermionR; typedef WilsonClover WilsonCloverTwoIndexSymmetricFermionF; typedef WilsonClover WilsonCloverTwoIndexSymmetricFermionD; -typedef WilsonClover 
WilsonCloverTwoIndexAntiSymmetricFermionR; typedef WilsonClover WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonClover WilsonCloverTwoIndexAntiSymmetricFermionD; @@ -165,161 +155,108 @@ typedef WilsonClover WilsonCloverTwoIndexAntiS template using CompactWilsonClover = CompactWilsonCloverFermion>; template using CompactWilsonExpClover = CompactWilsonCloverFermion>; -typedef CompactWilsonClover CompactWilsonCloverFermionR; +typedef CompactWilsonClover CompactWilsonCloverFermionD2; typedef CompactWilsonClover CompactWilsonCloverFermionF; typedef CompactWilsonClover CompactWilsonCloverFermionD; -typedef CompactWilsonExpClover CompactWilsonExpCloverFermionR; +typedef CompactWilsonExpClover CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover CompactWilsonExpCloverFermionD; -typedef CompactWilsonClover CompactWilsonCloverAdjFermionR; typedef CompactWilsonClover CompactWilsonCloverAdjFermionF; typedef CompactWilsonClover CompactWilsonCloverAdjFermionD; -typedef CompactWilsonClover CompactWilsonCloverTwoIndexSymmetricFermionR; typedef CompactWilsonClover CompactWilsonCloverTwoIndexSymmetricFermionF; typedef CompactWilsonClover CompactWilsonCloverTwoIndexSymmetricFermionD; -typedef CompactWilsonClover CompactWilsonCloverTwoIndexAntiSymmetricFermionR; typedef CompactWilsonClover CompactWilsonCloverTwoIndexAntiSymmetricFermionF; typedef CompactWilsonClover CompactWilsonCloverTwoIndexAntiSymmetricFermionD; // Domain Wall fermions -typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; typedef DomainWallFermion DomainWallFermionD; +typedef DomainWallFermion DomainWallFermionD2; -//typedef DomainWallFermion DomainWallFermionRL; -//typedef DomainWallFermion DomainWallFermionFH; -//typedef DomainWallFermion DomainWallFermionDF; - -typedef DomainWallEOFAFermion DomainWallEOFAFermionR; +typedef DomainWallEOFAFermion DomainWallEOFAFermionD2; typedef 
DomainWallEOFAFermion DomainWallEOFAFermionF; typedef DomainWallEOFAFermion DomainWallEOFAFermionD; -//typedef DomainWallEOFAFermion DomainWallEOFAFermionRL; -//typedef DomainWallEOFAFermion DomainWallEOFAFermionFH; -//typedef DomainWallEOFAFermion DomainWallEOFAFermionDF; - -typedef MobiusFermion MobiusFermionR; +typedef MobiusFermion MobiusFermionD2; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; -//typedef MobiusFermion MobiusFermionRL; -//typedef MobiusFermion MobiusFermionFH; -//typedef MobiusFermion MobiusFermionDF; - -typedef MobiusEOFAFermion MobiusEOFAFermionR; +typedef MobiusEOFAFermion MobiusEOFAFermionD2; typedef MobiusEOFAFermion MobiusEOFAFermionF; typedef MobiusEOFAFermion MobiusEOFAFermionD; -//typedef MobiusEOFAFermion MobiusEOFAFermionRL; -//typedef MobiusEOFAFermion MobiusEOFAFermionFH; -//typedef MobiusEOFAFermion MobiusEOFAFermionDF; - -typedef ZMobiusFermion ZMobiusFermionR; +typedef ZMobiusFermion ZMobiusFermionD2; typedef ZMobiusFermion ZMobiusFermionF; typedef ZMobiusFermion ZMobiusFermionD; -//typedef ZMobiusFermion ZMobiusFermionRL; -//typedef ZMobiusFermion ZMobiusFermionFH; -//typedef ZMobiusFermion ZMobiusFermionDF; - -// Ls vectorised -typedef ScaledShamirFermion ScaledShamirFermionR; +typedef ScaledShamirFermion ScaledShamirFermionD2; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; -typedef MobiusZolotarevFermion MobiusZolotarevFermionR; +typedef MobiusZolotarevFermion MobiusZolotarevFermionD2; typedef MobiusZolotarevFermion MobiusZolotarevFermionF; typedef MobiusZolotarevFermion MobiusZolotarevFermionD; -typedef ShamirZolotarevFermion ShamirZolotarevFermionR; +typedef ShamirZolotarevFermion ShamirZolotarevFermionD2; typedef ShamirZolotarevFermion ShamirZolotarevFermionF; typedef ShamirZolotarevFermion ShamirZolotarevFermionD; -typedef OverlapWilsonCayleyTanhFermion OverlapWilsonCayleyTanhFermionR; +typedef OverlapWilsonCayleyTanhFermion 
OverlapWilsonCayleyTanhFermionD2; typedef OverlapWilsonCayleyTanhFermion OverlapWilsonCayleyTanhFermionF; typedef OverlapWilsonCayleyTanhFermion OverlapWilsonCayleyTanhFermionD; -typedef OverlapWilsonCayleyZolotarevFermion OverlapWilsonCayleyZolotarevFermionR; +typedef OverlapWilsonCayleyZolotarevFermion OverlapWilsonCayleyZolotarevFermionD2; typedef OverlapWilsonCayleyZolotarevFermion OverlapWilsonCayleyZolotarevFermionF; typedef OverlapWilsonCayleyZolotarevFermion OverlapWilsonCayleyZolotarevFermionD; // Continued fraction -typedef OverlapWilsonContFracTanhFermion OverlapWilsonContFracTanhFermionR; +typedef OverlapWilsonContFracTanhFermion OverlapWilsonContFracTanhFermionD2; typedef OverlapWilsonContFracTanhFermion OverlapWilsonContFracTanhFermionF; typedef OverlapWilsonContFracTanhFermion OverlapWilsonContFracTanhFermionD; -typedef OverlapWilsonContFracZolotarevFermion OverlapWilsonContFracZolotarevFermionR; +typedef OverlapWilsonContFracZolotarevFermion OverlapWilsonContFracZolotarevFermionD2; typedef OverlapWilsonContFracZolotarevFermion OverlapWilsonContFracZolotarevFermionF; typedef OverlapWilsonContFracZolotarevFermion OverlapWilsonContFracZolotarevFermionD; // Partial fraction -typedef OverlapWilsonPartialFractionTanhFermion OverlapWilsonPartialFractionTanhFermionR; +typedef OverlapWilsonPartialFractionTanhFermion OverlapWilsonPartialFractionTanhFermionD2; typedef OverlapWilsonPartialFractionTanhFermion OverlapWilsonPartialFractionTanhFermionF; typedef OverlapWilsonPartialFractionTanhFermion OverlapWilsonPartialFractionTanhFermionD; -typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonPartialFractionZolotarevFermionR; +typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonPartialFractionZolotarevFermionD2; typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonPartialFractionZolotarevFermionF; typedef OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonPartialFractionZolotarevFermionD; // Gparity cases; partial 
list until tested -typedef WilsonFermion GparityWilsonFermionR; typedef WilsonFermion GparityWilsonFermionF; typedef WilsonFermion GparityWilsonFermionD; -//typedef WilsonFermion GparityWilsonFermionRL; -//typedef WilsonFermion GparityWilsonFermionFH; -//typedef WilsonFermion GparityWilsonFermionDF; - -typedef DomainWallFermion GparityDomainWallFermionR; typedef DomainWallFermion GparityDomainWallFermionF; typedef DomainWallFermion GparityDomainWallFermionD; -//typedef DomainWallFermion GparityDomainWallFermionRL; -//typedef DomainWallFermion GparityDomainWallFermionFH; -//typedef DomainWallFermion GparityDomainWallFermionDF; - -typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionR; +typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionD2; typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionF; typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionD; -//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionRL; -//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionFH; -//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionDF; - -typedef WilsonTMFermion GparityWilsonTMFermionR; +typedef WilsonTMFermion GparityWilsonTMFermionD2; typedef WilsonTMFermion GparityWilsonTMFermionF; typedef WilsonTMFermion GparityWilsonTMFermionD; -//typedef WilsonTMFermion GparityWilsonTMFermionRL; -//typedef WilsonTMFermion GparityWilsonTMFermionFH; -//typedef WilsonTMFermion GparityWilsonTMFermionDF; - -typedef MobiusFermion GparityMobiusFermionR; +typedef MobiusFermion GparityMobiusFermionD2; typedef MobiusFermion GparityMobiusFermionF; typedef MobiusFermion GparityMobiusFermionD; -//typedef MobiusFermion GparityMobiusFermionRL; -//typedef MobiusFermion GparityMobiusFermionFH; -//typedef MobiusFermion GparityMobiusFermionDF; - -typedef MobiusEOFAFermion GparityMobiusEOFAFermionR; +typedef MobiusEOFAFermion GparityMobiusEOFAFermionD2; typedef MobiusEOFAFermion GparityMobiusEOFAFermionF; typedef MobiusEOFAFermion GparityMobiusEOFAFermionD; -//typedef 
MobiusEOFAFermion GparityMobiusEOFAFermionRL; -//typedef MobiusEOFAFermion GparityMobiusEOFAFermionFH; -//typedef MobiusEOFAFermion GparityMobiusEOFAFermionDF; - -typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; -typedef NaiveStaggeredFermion NaiveStaggeredFermionR; typedef NaiveStaggeredFermion NaiveStaggeredFermionF; typedef NaiveStaggeredFermion NaiveStaggeredFermionD; -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; From 8f4e2ee545ac138152112f20d75b31cbc22cf77f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:53:46 -0400 Subject: [PATCH 150/240] Double2 --- Grid/qcd/action/fermion/WilsonImpl.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 2685796d..c7180115 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid); template class WilsonImpl : public PeriodicGaugeImpl > { public: - + static const int Dimension = Representation::Dimension; static const bool isFundamental = Representation::isFundamental; static const bool LsVectorised=false; @@ -242,19 +242,13 @@ public: typedef WilsonImpl WilsonImplR; // Real.. whichever prec typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double - -//typedef WilsonImpl WilsonImplRL; // Real.. whichever prec -//typedef WilsonImpl WilsonImplFH; // Float -//typedef WilsonImpl WilsonImplDF; // Double +typedef WilsonImpl WilsonImplD2; // Double typedef WilsonImpl ZWilsonImplR; // Real.. 
whichever prec typedef WilsonImpl ZWilsonImplF; // Float typedef WilsonImpl ZWilsonImplD; // Double +typedef WilsonImpl ZWilsonImplD2; // Double -//typedef WilsonImpl ZWilsonImplRL; // Real.. whichever prec -//typedef WilsonImpl ZWilsonImplFH; // Float -//typedef WilsonImpl ZWilsonImplDF; // Double - typedef WilsonImpl WilsonAdjImplR; // Real.. whichever prec typedef WilsonImpl WilsonAdjImplF; // Float typedef WilsonImpl WilsonAdjImplD; // Double From 70c83ec3bec087184308ccbe6d5006260e2c3f9a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:54:23 -0400 Subject: [PATCH 151/240] More instantiations --- .../action/fermion/instantiation/generate_instantiations.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index d6845e75..4ccc01e8 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -9,6 +9,7 @@ STAG5_IMPL_LIST="" WILSON_IMPL_LIST=" \ WilsonImplF \ WilsonImplD \ + WilsonImplD2 \ WilsonAdjImplF \ WilsonAdjImplD \ WilsonTwoIndexSymmetricImplF \ @@ -25,8 +26,9 @@ COMPACT_WILSON_IMPL_LIST=" \ DWF_IMPL_LIST=" \ WilsonImplF \ WilsonImplD \ + WilsonImplD2 \ ZWilsonImplF \ - ZWilsonImplD " + ZWilsonImplD2 " GDWF_IMPL_LIST=" \ GparityWilsonImplF \ From 97448a93dc75313d076c85edf6de2fae1e5507bc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:55:25 -0400 Subject: [PATCH 152/240] Double2 compiles and dslash runs --- Grid/simd/Grid_vector_types.h | 24 ++++++------ Grid/simd/Grid_vector_unops.h | 74 +++++++++++++++++++++++++++++++---- Grid/simd/Simd.h | 10 ++--- 3 files changed, 82 insertions(+), 26 deletions(-) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 4f952bb2..fd71a84a 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ 
-1055,7 +1055,7 @@ accelerator_inline typename toComplexMapper::Complexified toComplex(const } -accelerator_inline void precisionChange(vRealF *out,vRealD *in,int nvec) +accelerator_inline void precisionChange(vRealF *out,const vRealD *in,int nvec) { assert((nvec&0x1)==0); for(int m=0;m*2 @@ -112,6 +111,9 @@ template struct ImagFunctor { accelerator scalar operator()(const scalar &a) const { return imag(a); } }; +///////////// +// Unary operations +///////////// template accelerator_inline Grid_simd real(const Grid_simd &r) { return SimdApply(RealFunctor(), r); @@ -168,6 +170,65 @@ template accelerator_inline Grid_simd div(const Grid_simd &r, Integer y) { return SimdApply(DivIntFunctor(y), r); } +/// Double 2 cases +template +accelerator_inline Grid_simd2 real(const Grid_simd2 &r) { + return SimdApply(RealFunctor(), r); +} +template +accelerator_inline Grid_simd2 imag(const Grid_simd2 &r) { + return SimdApply(ImagFunctor(), r); +} +template +accelerator_inline Grid_simd2 sqrt(const Grid_simd2 &r) { + return SimdApply(SqrtRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 cos(const Grid_simd2 &r) { + return SimdApply(CosRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 sin(const Grid_simd2 &r) { + return SimdApply(SinRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 acos(const Grid_simd2 &r) { + return SimdApply(AcosRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 asin(const Grid_simd2 &r) { + return SimdApply(AsinRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 log(const Grid_simd2 &r) { + return SimdApply(LogRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 abs(const Grid_simd2 &r) { + return SimdApply(AbsRealFunctor(), r); +} +template +accelerator_inline Grid_simd2 exp(const Grid_simd2 &r) { + return SimdApply(ExpFunctor(), r); +} +template +accelerator_inline Grid_simd2 Not(const Grid_simd2 &r) { + return SimdApply(NotFunctor(), r); +} +template +accelerator_inline Grid_simd2 
pow(const Grid_simd2 &r, double y) { + return SimdApply(PowRealFunctor(y), r); +} +template +accelerator_inline Grid_simd2 mod(const Grid_simd2 &r, Integer y) { + return SimdApply(ModIntFunctor(y), r); +} +template +accelerator_inline Grid_simd2 div(const Grid_simd2 &r, Integer y) { + return SimdApply(DivIntFunctor(y), r); +} + + //////////////////////////////////////////////////////////////////////////// // Allows us to assign into **conformable** real vectors from complex //////////////////////////////////////////////////////////////////////////// @@ -193,23 +254,22 @@ struct OrOrFunctor { //////////////////////////////// template accelerator_inline Grid_simd operator&(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(AndFunctor(), x, y); } template accelerator_inline Grid_simd operator&&(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(AndAndFunctor(), x, y); } template accelerator_inline Grid_simd operator|(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(OrFunctor(), x, y); } template accelerator_inline Grid_simd operator||(const Grid_simd &x, - const Grid_simd &y) { + const Grid_simd &y) { return SimdApplyBinop(OrOrFunctor(), x, y); } NAMESPACE_END(Grid); -#endif diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index 76ca3bef..ddee6a36 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -224,18 +224,14 @@ accelerator_inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm); NAMESPACE_END(Grid); #include +#include #include NAMESPACE_BEGIN(Grid); -// Default precision -#ifdef GRID_DEFAULT_PRECISION_DOUBLE + +// Default precision is wired to double typedef vRealD vReal; typedef vComplexD vComplex; -#else -typedef vRealF vReal; -typedef vComplexF vComplex; -#endif - inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){ int nn=vComplexF::Nsimd(); From 234324599e407e9084665702391fc154c882e0b6 Mon Sep 17 00:00:00 
2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:56:10 -0400 Subject: [PATCH 153/240] Double2 --- Grid/tensors/Tensor_inner.h | 20 ++++----- Grid/tensors/Tensor_traits.h | 81 +++++------------------------------- 2 files changed, 18 insertions(+), 83 deletions(-) diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h index fd651cae..05f265d7 100644 --- a/Grid/tensors/Tensor_inner.h +++ b/Grid/tensors/Tensor_inner.h @@ -214,24 +214,20 @@ accelerator_inline vRealD innerProductD2(const vRealD &l,const vRealD & accelerator_inline vComplexD2 innerProductD2(const vComplexF &l,const vComplexF &r) { - vComplexD la,lb; - vComplexD ra,rb; - Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); - Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v); + vComplexD2 dl,dr; vComplexD2 ret; - ret._internal[0] = innerProduct(la,ra); - ret._internal[1] = innerProduct(lb,rb); + precisionChange(dl,l); + precisionChange(dr,r); + ret = innerProduct(dl,dr); return ret; } accelerator_inline vRealD2 innerProductD2(const vRealF &l,const vRealF &r) { - vRealD la,lb; - vRealD ra,rb; - Optimization::PrecisionChange::StoD(l.v,la.v,lb.v); - Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v); + vRealD2 dl,dr; vRealD2 ret; - ret._internal[0]=innerProduct(la,ra); - ret._internal[1]=innerProduct(lb,rb); + precisionChange(dl,l); + precisionChange(dr,r); + ret=innerProduct(dl,dr); return ret; } diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 89290214..99633cee 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -42,39 +42,6 @@ NAMESPACE_BEGIN(Grid); template struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; }; template struct isGridScalar> : public std::true_type { static constexpr bool notvalue = false; }; - // Store double-precision data in single-precision grids for precision promoted localInnerProductD - template - class TypePair { - public: - T _internal[2]; - accelerator TypePair& 
operator=(const Grid::Zero& o) { - _internal[0] = Zero(); - _internal[1] = Zero(); - return *this; - } - - accelerator TypePair operator+(const TypePair& o) const { - TypePair r; - r._internal[0] = _internal[0] + o._internal[0]; - r._internal[1] = _internal[1] + o._internal[1]; - return r; - } - - accelerator TypePair& operator+=(const TypePair& o) { - _internal[0] += o._internal[0]; - _internal[1] += o._internal[1]; - return *this; - } - - friend accelerator_inline void add(TypePair* ret, const TypePair* a, const TypePair* b) { - add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]); - add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]); - } - }; - typedef TypePair ComplexD2; - typedef TypePair RealD2; - typedef TypePair vComplexD2; - typedef TypePair vRealD2; // Traits to identify fundamental data types template struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; }; @@ -88,8 +55,6 @@ NAMESPACE_BEGIN(Grid); template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; - template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; - template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; ////////////////////////////////////////////////////////////////////////////////// @@ -136,7 +101,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef RealD DoublePrecision; - typedef RealD2 DoublePrecision2; + typedef RealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -151,19 +116,6 @@ NAMESPACE_BEGIN(Grid); typedef RealD DoublePrecision; typedef RealD DoublePrecision2; }; - template<> 
struct GridTypeMapper : public GridTypeMapper_Base { - typedef RealD2 scalar_type; - typedef RealD2 scalar_typeD; - typedef RealD2 vector_type; - typedef RealD2 vector_typeD; - typedef RealD2 tensor_reduced; - typedef RealD2 scalar_object; - typedef RealD2 scalar_objectD; - typedef ComplexD2 Complexified; - typedef RealD2 Realified; - typedef RealD2 DoublePrecision; - typedef RealD2 DoublePrecision2; - }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; typedef ComplexD scalar_typeD; @@ -175,7 +127,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef ComplexD DoublePrecision; - typedef ComplexD2 DoublePrecision2; + typedef ComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; @@ -220,19 +172,6 @@ NAMESPACE_BEGIN(Grid); }; #endif - template<> struct GridTypeMapper : public GridTypeMapper_Base { - typedef ComplexD2 scalar_type; - typedef ComplexD2 scalar_typeD; - typedef ComplexD2 vector_type; - typedef ComplexD2 vector_typeD; - typedef ComplexD2 tensor_reduced; - typedef ComplexD2 scalar_object; - typedef ComplexD2 scalar_objectD; - typedef ComplexD2 Complexified; - typedef RealD2 Realified; - typedef ComplexD2 DoublePrecision; - typedef ComplexD2 DoublePrecision2; - }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; typedef Integer scalar_typeD; @@ -274,13 +213,13 @@ NAMESPACE_BEGIN(Grid); typedef vRealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { - typedef RealD2 scalar_type; - typedef RealD2 scalar_typeD; + typedef RealD scalar_type; + typedef RealD scalar_typeD; typedef vRealD2 vector_type; typedef vRealD2 vector_typeD; typedef vRealD2 tensor_reduced; - typedef RealD2 scalar_object; - typedef RealD2 scalar_objectD; + typedef RealD scalar_object; + typedef RealD scalar_objectD; typedef vComplexD2 Complexified; typedef vRealD2 
Realified; typedef vRealD2 DoublePrecision; @@ -341,13 +280,13 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { - typedef ComplexD2 scalar_type; - typedef ComplexD2 scalar_typeD; + typedef ComplexD scalar_type; + typedef ComplexD scalar_typeD; typedef vComplexD2 vector_type; typedef vComplexD2 vector_typeD; typedef vComplexD2 tensor_reduced; - typedef ComplexD2 scalar_object; - typedef ComplexD2 scalar_objectD; + typedef ComplexD scalar_object; + typedef ComplexD scalar_objectD; typedef vComplexD2 Complexified; typedef vRealD2 Realified; typedef vComplexD2 DoublePrecision; From af9ecb8b41fe48628b713be8dee191a13bf20b34 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:56:55 -0400 Subject: [PATCH 154/240] Current tests compiling --- HMC/Mobius2p1f.cc | 6 ++-- HMC/Mobius2p1fEOFA.cc | 8 +++--- HMC/Mobius2p1fEOFA_F1.cc | 8 +++--- HMC/Mobius2p1fRHMC.cc | 6 ++-- HMC/Mobius2p1f_DD_RHMC.cc | 2 +- HMC/Mobius2p1f_DD_RHMC_96I.cc | 8 +++--- HMC/Mobius2p1f_DD_RHMC_96I_mixed.cc | 10 +++---- HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc | 34 +++++++++++++++++------ 8 files changed, 50 insertions(+), 32 deletions(-) diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc index 5f82e0e7..4ab1f20f 100644 --- a/HMC/Mobius2p1f.cc +++ b/HMC/Mobius2p1f.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; @@ -133,8 +133,8 @@ int main(int argc, char **argv) { //////////////////////////////////// // FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params); - // DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); - // DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, 
*FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); + // DomainWallEOFAFermionD Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); + // DomainWallEOFAFermionD Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); // ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false); FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); diff --git a/HMC/Mobius2p1fEOFA.cc b/HMC/Mobius2p1fEOFA.cc index b1294da5..c961cbc9 100644 --- a/HMC/Mobius2p1fEOFA.cc +++ b/HMC/Mobius2p1fEOFA.cc @@ -175,9 +175,9 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef MobiusFermionF FermionActionF; - typedef MobiusEOFAFermionR FermionEOFAAction; + typedef MobiusEOFAFermionD FermionEOFAAction; typedef MobiusEOFAFermionF FermionEOFAActionF; typedef typename FermionAction::FermionField FermionField; typedef typename FermionActionF::FermionField FermionFieldF; @@ -293,9 +293,9 @@ int main(int argc, char **argv) { OFRp.precision= 50; - MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, 
-1.0, 1, M5, b, c); ConjugateGradient ActionCG(ActionStoppingCondition,MaxCGIterations); diff --git a/HMC/Mobius2p1fEOFA_F1.cc b/HMC/Mobius2p1fEOFA_F1.cc index 3f0a7bf6..f910d69e 100644 --- a/HMC/Mobius2p1fEOFA_F1.cc +++ b/HMC/Mobius2p1fEOFA_F1.cc @@ -159,9 +159,9 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef MobiusFermionF FermionActionF; - typedef MobiusEOFAFermionR FermionEOFAAction; + typedef MobiusEOFAFermionD FermionEOFAAction; typedef MobiusEOFAFermionF FermionEOFAActionF; typedef typename FermionAction::FermionField FermionField; typedef typename FermionActionF::FermionField FermionFieldF; @@ -281,9 +281,9 @@ int main(int argc, char **argv) { OFRp.precision= 50; - MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); ConjugateGradient ActionCG(ActionStoppingCondition,MaxCGIterations); diff --git a/HMC/Mobius2p1fRHMC.cc b/HMC/Mobius2p1fRHMC.cc index b958d548..288a6c54 100644 --- a/HMC/Mobius2p1fRHMC.cc +++ b/HMC/Mobius2p1fRHMC.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - 
typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; @@ -137,8 +137,8 @@ int main(int argc, char **argv) { //////////////////////////////////// // FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params); - // DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); - // DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); + // DomainWallEOFAFermionD Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); + // DomainWallEOFAFermionD Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); // ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false); FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); diff --git a/HMC/Mobius2p1f_DD_RHMC.cc b/HMC/Mobius2p1f_DD_RHMC.cc index aca408e7..39b4c1dd 100644 --- a/HMC/Mobius2p1f_DD_RHMC.cc +++ b/HMC/Mobius2p1f_DD_RHMC.cc @@ -37,7 +37,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; diff --git a/HMC/Mobius2p1f_DD_RHMC_96I.cc b/HMC/Mobius2p1f_DD_RHMC_96I.cc index a6a2f26c..5158aed9 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc @@ -37,7 +37,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; @@ -286,8 +286,8 @@ int main(int argc, char **argv) { // ii) Break low 
bound, how rapidly? // iii) Run lanczos // iv) Have CG return spectral range estimate - FermionField vec(StrangeOp.FermionRedBlackGrid()); - FermionField res(StrangeOp.FermionRedBlackGrid()); + FermionField vec(StrangeOp.FermionDedBlackGrid()); + FermionField res(StrangeOp.FermionDedBlackGrid()); vec = 1; // Fill with any old junk std::cout << "Bounds check on strange operator mass "<< StrangeOp.Mass()<(Block4,Width)); ////////////////////////// @@ -311,7 +311,7 @@ int main(int argc, char **argv) { // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; double StoppingCondition = 1e-10; - double MDStoppingCondition = 1e-7; + double MDStoppingCondition = 1e-6; double MDStoppingConditionLoose = 1e-6; double MaxCGIterations = 300000; ConjugateGradient CG(StoppingCondition,MaxCGIterations); diff --git a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc index 732c4666..63315ec3 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I_mixedmshift.cc @@ -128,14 +128,9 @@ template MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); -#else - std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); -#endif MPCG(src,psi); } }; @@ -153,7 +148,7 @@ int main(int argc, char **argv) { typedef WilsonImplR FermionImplPolicy; typedef WilsonImplF FermionImplPolicyF; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef MobiusFermionF FermionActionF; typedef typename FermionAction::FermionField FermionField; typedef typename FermionActionF::FermionField FermionFieldF; @@ -300,9 +295,13 @@ int main(int argc, char **argv) { // These lines are unecessary if BC are all periodic std::vector boundary = {1,1,1,-1}; FermionAction::ImplParams Params(boundary); - Params.dirichlet=NonDirichlet; FermionAction::ImplParams 
ParamsDir(boundary); + FermionActionF::ImplParams ParamsF(boundary); + FermionActionF::ImplParams ParamsDirF(boundary); + Params.dirichlet=NonDirichlet; + ParamsF.dirichlet=NonDirichlet; ParamsDir.dirichlet=Dirichlet; + ParamsDirF.dirichlet=Dirichlet; // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; @@ -323,15 +322,34 @@ int main(int argc, char **argv) { //////////////////////////////////// // Strange action //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); FermionAction StrangeOpDir (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, ParamsDir); FermionAction StrangePauliVillarsOpDir(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, ParamsDir); - + + FermionActionF StrangeOpF (UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,b,c, ParamsF); + FermionActionF StrangePauliVillarsOpF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,pv_mass, M5,b,c, ParamsF); + + FermionActionF StrangeOpDirF (UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,strange_mass,M5,b,c, ParamsDirF); + FermionActionF StrangePauliVillarsOpDirF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,pv_mass, M5,b,c, ParamsDirF); + +#if 1 + OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp, + StrangeOpDirF,StrangeOpF, + SFRp,500); + OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir, + StrangePauliVillarsOpDirF,StrangeOpDirF, + SFRp,500); + OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir, + StrangePauliVillarsOpF,StrangePauliVillarsOpDirF, + SFRp,500); +#else OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionBdy(StrangeOpDir,StrangeOp,SFRp); 
OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionLocal(StrangePauliVillarsOpDir,StrangeOpDir,SFRp); OneFlavourEvenOddRatioRationalPseudoFermionAction StrangePseudoFermionPVBdy(StrangePauliVillarsOp,StrangePauliVillarsOpDir,SFRp); +#endif Level1.push_back(&StrangePseudoFermionBdy); // ok Level2.push_back(&StrangePseudoFermionLocal); Level1.push_back(&StrangePseudoFermionPVBdy); //ok From 25df2d2c3ba7d3e5f3dd66cbc12d673851d95668 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:57:12 -0400 Subject: [PATCH 155/240] Various precision options --- benchmarks/Benchmark_dwf_fp32.cc | 113 +++++++++++++++++-------------- 1 file changed, 62 insertions(+), 51 deletions(-) diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 5ee764c4..029b2016 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -123,26 +123,44 @@ void Benchmark(int Ls, Coordinate Dirichlet) long unsigned int single_site_flops = 8*Nc*(7+16*Nc); - GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); +#define DOUBLE +#ifdef SINGLE + typedef vComplexF Simd; + typedef LatticeFermionF FermionField; + typedef LatticeGaugeFieldF GaugeField; + typedef LatticeColourMatrixF ColourMatrixField; + typedef DomainWallFermionF FermionAction; +#endif +#ifdef DOUBLE + typedef vComplexD Simd; + typedef LatticeFermionD FermionField; + typedef LatticeGaugeFieldD GaugeField; + typedef LatticeColourMatrixD ColourMatrixField; + typedef DomainWallFermionD FermionAction; +#endif +#ifdef DOUBLE2 + typedef vComplexD2 Simd; + typedef LatticeFermionD2 FermionField; + typedef LatticeGaugeFieldD2 GaugeField; + typedef LatticeColourMatrixD2 ColourMatrixField; + typedef DomainWallFermionD2 FermionAction; +#endif + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi()); - GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); - GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); - GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); - - std::vector seeds4({1,2,3,4}); - std::vector seeds5({5,6,7,8}); - std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); - LatticeFermionF src (FGrid); random(RNG5,src); + + FermionField src (FGrid); random(RNG5,src); #if 0 src = Zero(); { @@ -158,14 +176,14 @@ void Benchmark(int Ls, Coordinate Dirichlet) src = src*N2; #endif - LatticeFermionF result(FGrid); result=Zero(); - LatticeFermionF ref(FGrid); ref=Zero(); - LatticeFermionF tmp(FGrid); - LatticeFermionF err(FGrid); + FermionField result(FGrid); result=Zero(); + FermionField ref(FGrid); ref=Zero(); + FermionField tmp(FGrid); + FermionField err(FGrid); std::cout << GridLogMessage << "Drawing gauge field" << std::endl; - LatticeGaugeFieldF Umu(UGrid); - LatticeGaugeFieldF UmuCopy(UGrid); + GaugeField Umu(UGrid); + GaugeField UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; @@ -179,13 +197,13 @@ void Benchmark(int Ls, Coordinate Dirichlet) std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl; std::cout << GridLogMessage << "Applying 
BCs for Dirichlet Block4 " << Block << std::endl; - DirichletFilter Filter(Block); + DirichletFilter Filter(Block); Filter.applyFilter(Umu); //////////////////////////////////// // Naive wilson implementation //////////////////////////////////// - std::vector U(4,UGrid); + std::vector U(4,UGrid); for(int mu=0;mu(Umu,mu); } @@ -236,10 +254,8 @@ void Benchmark(int Ls, Coordinate Dirichlet) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); Dw.Dhop(src,result,0); @@ -270,8 +287,8 @@ void Benchmark(int Ls, Coordinate Dirichlet) double volume=Ls; for(int mu=0;mu1.0e-4) ) { + if(( n2e>1.0e-4) ) { std::cout<Barrier(); exit(-1); } - assert (norm2(err)< 1.0e-4 ); + assert (n2e< 1.0e-4 ); } if (1) @@ -348,14 +364,16 @@ void Benchmark(int Ls, Coordinate Dirichlet) std::cout<Barrier(); @@ -414,8 +424,9 @@ void Benchmark(int Ls, Coordinate Dirichlet) setCheckerboard(r_eo,r_e); err = r_eo-result; - std::cout< Date: Tue, 27 Sep 2022 10:58:00 -0400 Subject: [PATCH 156/240] Ticked off a few items --- TODO | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/TODO b/TODO index e23e040d..12c523a6 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,11 @@ +====== +DDHMC +====== +-- Reliable Update CG - DONE +-- Multishift Mixed Precision - DONE +-- Pole dependent residual - DONE + +======= -- comms threads issue?? -- Part done: Staggered kernel performance on GPU @@ -8,7 +16,7 @@ General - Make representations code take Gimpl - Simplify the HMCand remove modules - Lattice_arith - are the mult, mac etc.. still needed after ET engine? 
-- Lattice_rng - faster local only loop in init +- Lattice_rng - faster local only loop in init -- DDHMC - Audit: accelerate A2Autils -- off critical path for HMC ========================================================= From fad2f969d9ec25ccaf028ee4aca65874a3f778b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 27 Sep 2022 10:58:43 -0400 Subject: [PATCH 157/240] Summit up to date --- systems/Summit/comms.4node | 179 ---------------------------- systems/Summit/config-command | 2 +- systems/Summit/dwf.24.4node | 218 ++++++++++------------------------ systems/Summit/dwf.32.4node | 197 +++++++----------------------- systems/Summit/dwf4.lsf | 13 +- 5 files changed, 112 insertions(+), 497 deletions(-) delete mode 100644 systems/Summit/comms.4node diff --git a/systems/Summit/comms.4node b/systems/Summit/comms.4node deleted file mode 100644 index b0df0801..00000000 --- a/systems/Summit/comms.4node +++ /dev/null @@ -1,179 +0,0 @@ -OPENMPI detected -AcceleratorCudaInit[0]: ======================== -AcceleratorCudaInit[0]: Device Number : 0 -AcceleratorCudaInit[0]: ======================== -AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB -AcceleratorCudaInit[0]: totalGlobalMem: 16911433728 -AcceleratorCudaInit[0]: managedMemory: 1 -AcceleratorCudaInit[0]: isMultiGpuBoard: 0 -AcceleratorCudaInit[0]: warpSize: 32 -AcceleratorCudaInit[0]: pciBusID: 4 -AcceleratorCudaInit[0]: pciDeviceID: 0 -AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) -AcceleratorCudaInit: rank 0 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 0 device 0 bus id: 0004:04:00.0 -AcceleratorCudaInit: ================================================ -SharedMemoryMpi: World communicator of size 24 -SharedMemoryMpi: Node communicator of size 6 -0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers -Setting up IPC - -__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ 
-__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ -__|_ | | | | | | | | | | | | _|__ -__|_ _|__ -__|_ GGGG RRRR III DDDD _|__ -__|_ G R R I D D _|__ -__|_ G R R I D D _|__ -__|_ G GG RRRR I D D _|__ -__|_ G G R R I D D _|__ -__|_ GGGG R R III DDDD _|__ -__|_ _|__ -__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ -__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ - | | | | | | | | | | | | | | - - -Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean - -Grid : Message : ================================================ -Grid : Message : MPI is initialised and logging filters activated -Grid : Message : ================================================ -Grid : Message : Requested 1073741824 byte stencil comms buffers -AcceleratorCudaInit: rank 1 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 1 device 1 bus id: 0004:05:00.0 -AcceleratorCudaInit: rank 2 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 2 device 2 bus id: 0004:06:00.0 -AcceleratorCudaInit: rank 5 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 5 device 5 bus id: 0035:05:00.0 -AcceleratorCudaInit: rank 4 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 4 
device 4 bus id: 0035:04:00.0 -AcceleratorCudaInit: rank 3 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 3 device 3 bus id: 0035:03:00.0 -Grid : Message : MemoryManager Cache 13529146982 bytes -Grid : Message : MemoryManager::Init() setting up -Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 -Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory -Grid : Message : MemoryManager::Init() Using cudaMalloc -Grid : Message : 2.137929 s : Grid is setup to use 6 threads -Grid : Message : 2.137941 s : Number of iterations to average: 250 -Grid : Message : 2.137950 s : ==================================================================================================== -Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory -Grid : Message : 2.137966 s : ==================================================================================================== -Grid : Message : 2.137974 s : L Ls bytes MB/s uni MB/s bidi -AcceleratorCudaInit: rank 22 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 10 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 15 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 21 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 20 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 7 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 9 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 11 setting device to 
node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 8 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 6 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 19 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 23 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 18 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 12 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 16 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 13 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 14 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 17 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -Grid : Message : 2.604949 s : 8 8 393216 89973.9 179947.8 -Grid : Message : 2.668249 s : 8 8 393216 18650.3 37300.5 -Grid : Message : 2.732288 s : 8 8 393216 18428.5 36857.1 -Grid : Message : 2.753565 s : 8 8 393216 55497.2 110994.4 -Grid : Message : 2.808960 s : 12 8 1327104 100181.5 200363.0 -Grid : Message : 3.226900 s : 12 8 1327104 20600.5 41201.0 -Grid : Message : 3.167459 s : 12 8 1327104 24104.6 48209.2 -Grid : Message : 3.227660 s : 12 8 1327104 66156.7 132313.5 -Grid : Message : 3.413570 s : 16 8 3145728 56174.4 112348.8 -Grid : Message : 3.802697 s : 16 8 3145728 24255.9 48511.7 -Grid : Message : 4.190498 s : 16 8 3145728 24336.7 48673.4 -Grid : Message : 4.385171 s : 16 8 
3145728 48484.1 96968.2 -Grid : Message : 4.805284 s : 20 8 6144000 46380.5 92761.1 -Grid : Message : 5.562975 s : 20 8 6144000 24328.5 48656.9 -Grid : Message : 6.322562 s : 20 8 6144000 24266.7 48533.4 -Grid : Message : 6.773598 s : 20 8 6144000 40868.5 81736.9 -Grid : Message : 7.600999 s : 24 8 10616832 40198.3 80396.6 -Grid : Message : 8.912917 s : 24 8 10616832 24279.5 48559.1 -Grid : Message : 10.220961 s : 24 8 10616832 24350.2 48700.4 -Grid : Message : 11.728250 s : 24 8 10616832 37390.9 74781.8 -Grid : Message : 12.497258 s : 28 8 16859136 36792.2 73584.5 -Grid : Message : 14.585387 s : 28 8 16859136 24222.2 48444.3 -Grid : Message : 16.664783 s : 28 8 16859136 24323.4 48646.8 -Grid : Message : 17.955238 s : 28 8 16859136 39194.7 78389.4 -Grid : Message : 20.136479 s : 32 8 25165824 35718.3 71436.5 -Grid : Message : 23.241958 s : 32 8 25165824 24311.4 48622.9 -Grid : Message : 26.344810 s : 32 8 25165824 24331.9 48663.7 -Grid : Message : 28.384420 s : 32 8 25165824 37016.3 74032.7 -Grid : Message : 28.388879 s : ==================================================================================================== -Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory -Grid : Message : 28.388909 s : ==================================================================================================== -Grid : Message : 28.388924 s : L Ls bytes MB/s uni MB/s bidi -Grid : Message : 28.553993 s : 8 8 393216 8272.4 16544.7 -Grid : Message : 28.679592 s : 8 8 393216 9395.4 18790.8 -Grid : Message : 28.811112 s : 8 8 393216 8971.0 17942.0 -Grid : Message : 28.843770 s : 8 8 393216 36145.6 72291.2 -Grid : Message : 28.981754 s : 12 8 1327104 49591.6 99183.2 -Grid : Message : 29.299764 s : 12 8 1327104 12520.8 25041.7 -Grid : Message : 29.620288 s : 12 8 1327104 12422.2 24844.4 -Grid : Message : 29.657645 s : 12 8 1327104 106637.5 213275.1 -Grid : Message : 29.952933 s : 16 8 3145728 43939.2 87878.5 -Grid : Message : 30.585411 s : 16 8 
3145728 14922.1 29844.2 -Grid : Message : 31.219781 s : 16 8 3145728 14877.2 29754.4 -Grid : Message : 31.285017 s : 16 8 3145728 144724.3 289448.7 -Grid : Message : 31.706443 s : 20 8 6144000 54676.2 109352.4 -Grid : Message : 32.739205 s : 20 8 6144000 17848.0 35696.1 -Grid : Message : 33.771852 s : 20 8 6144000 17849.9 35699.7 -Grid : Message : 33.871981 s : 20 8 6144000 184141.4 368282.8 -Grid : Message : 34.536808 s : 24 8 10616832 55784.3 111568.6 -Grid : Message : 36.275648 s : 24 8 10616832 18317.6 36635.3 -Grid : Message : 37.997181 s : 24 8 10616832 18501.7 37003.4 -Grid : Message : 38.140442 s : 24 8 10616832 222383.9 444767.9 -Grid : Message : 39.177222 s : 28 8 16859136 56609.7 113219.4 -Grid : Message : 41.874755 s : 28 8 16859136 18749.9 37499.8 -Grid : Message : 44.529381 s : 28 8 16859136 19052.9 38105.8 -Grid : Message : 44.742192 s : 28 8 16859136 237717.1 475434.2 -Grid : Message : 46.184000 s : 32 8 25165824 57091.2 114182.4 -Grid : Message : 50.734740 s : 32 8 25165824 19411.0 38821.9 -Grid : Message : 53.931228 s : 32 8 25165824 19570.6 39141.2 -Grid : Message : 54.238467 s : 32 8 25165824 245765.6 491531.2 -Grid : Message : 54.268664 s : ==================================================================================================== -Grid : Message : 54.268680 s : = All done; Bye Bye -Grid : Message : 54.268691 s : ==================================================================================================== diff --git a/systems/Summit/config-command b/systems/Summit/config-command index 4caf652e..2a856be0 100644 --- a/systems/Summit/config-command +++ b/systems/Summit/config-command @@ -3,7 +3,7 @@ --enable-gen-simd-width=32 \ --enable-unified=no \ --enable-shm=no \ - --disable-gparity \ + --enable-gparity \ --disable-setdevice \ --disable-fermion-reps \ --enable-accelerator=cuda \ diff --git a/systems/Summit/dwf.24.4node b/systems/Summit/dwf.24.4node index 212e471c..d0ca9697 100644 --- a/systems/Summit/dwf.24.4node +++ 
b/systems/Summit/dwf.24.4node @@ -10,19 +10,16 @@ AcceleratorCudaInit[0]: warpSize: 32 AcceleratorCudaInit[0]: pciBusID: 4 AcceleratorCudaInit[0]: pciDeviceID: 0 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) -AcceleratorCudaInit: rank 0 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses +AcceleratorCudaInit: a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no local rank 0 device 0 bus id: 0004:04:00.0 AcceleratorCudaInit: ================================================ SharedMemoryMpi: World communicator of size 24 -SharedMemoryMpi: Node communicator of size 6 -0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers -AcceleratorCudaInit: rank 3 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 3 device 3 bus id: 0035:03:00.0 -AcceleratorCudaInit: rank 5 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 5 device 5 bus id: 0035:05:00.0 +SharedMemoryMpi: Node communicator of size 1 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200080000000 - 2000bfffffff for comms buffers Setting up IPC __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ @@ -36,6 +33,11 @@ __|_ G GG RRRR I D D _|__ __|_ G G R R I D D _|__ __|_ GGGG R R III DDDD _|__ __|_ _|__ +local rank 5 device 0 bus id: 0035:05:00.0 +local rank 1 device 0 bus id: 0004:05:00.0 +local rank 2 device 0 bus id: 0004:06:00.0 +local rank 3 device 0 bus id: 0035:03:00.0 +local rank 4 device 0 bus id: 0035:04:00.0 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ | | | | | | | | | | | | | | @@ -45,15 
+47,6 @@ Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli a This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -AcceleratorCudaInit: rank 4 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 4 device 4 bus id: 0035:04:00.0 -AcceleratorCudaInit: rank 1 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 1 device 1 bus id: 0004:05:00.0 -AcceleratorCudaInit: rank 2 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 2 device 2 bus id: 0004:06:00.0 the Free Software Foundation; either version 2 of the License, or (at your option) any later version. @@ -61,146 +54,63 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
-Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean +Current Grid git commit hash=1713de35c0dc339564661dd7df8a72583f889e91: (HEAD -> feature/dirichlet) uncommited changes Grid : Message : ================================================ Grid : Message : MPI is initialised and logging filters activated Grid : Message : ================================================ -Grid : Message : Requested 2147483648 byte stencil comms buffers -Grid : Message : MemoryManager Cache 8388608000 bytes +Grid : Message : Requested 1073741824 byte stencil comms buffers +Grid : Message : MemoryManager Cache 4194304000 bytes Grid : Message : MemoryManager::Init() setting up Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory Grid : Message : MemoryManager::Init() Using cudaMalloc -Grid : Message : 1.731905 s : Grid Layout -Grid : Message : 1.731915 s : Global lattice size : 48 48 48 72 -Grid : Message : 1.731928 s : OpenMP threads : 6 -Grid : Message : 1.731938 s : MPI tasks : 2 2 2 3 -AcceleratorCudaInit: rank 9 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 23 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 22 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 21 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 18 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 6 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 7 setting device to node rank 1 
-AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 10 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 8 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 11 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 20 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 19 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 13 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 12 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 14 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 16 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 15 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 17 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -Grid : Message : 2.683494 s : Making s innermost grids -Grid : Message : 2.780034 s : Initialising 4d RNG -Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG' -Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 -Grid : Message : 2.916841 s : Initialising 5d RNG -Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG' -Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a -Grid : Message : 5.264345 s : 
Initialised RNGs -Grid : Message : 6.489904 s : Drawing gauge field -Grid : Message : 6.729262 s : Random gauge initialised -Grid : Message : 7.781273 s : Setting up Cshift based reference -Grid : Message : 8.725313 s : ***************************************************************** -Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm -Grid : Message : 8.725342 s : ***************************************************************** -Grid : Message : 8.725352 s : ***************************************************************** -Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop -Grid : Message : 8.725372 s : * Vectorising space-time by 4 -Grid : Message : 8.725383 s : * VComplexF size is 32 B -Grid : Message : 8.725395 s : * SINGLE precision -Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute -Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 8.725425 s : ***************************************************************** -Grid : Message : 9.465229 s : Called warmup -Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us -Grid : Message : 58.646121 s : mflop/s = 1.02592e+07 -Grid : Message : 58.646134 s : mflop/s per rank = 427468 -Grid : Message : 58.646145 s : mflop/s per node = 2.56481e+06 -Grid : Message : 58.646156 s : RF GiB/s (base 2) = 20846.5 -Grid : Message : 58.646166 s : mem GiB/s (base 2) = 13029.1 -Grid : Message : 58.648008 s : norm diff 1.04778e-13 -Grid : Message : 58.734885 s : #### Dhop calls report -Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls : 6002 -Grid : Message : 58.734909 s : WilsonFermion5D TotalTime /Calls : 8217.71 us -Grid : Message : 58.734922 s : WilsonFermion5D CommTime /Calls : 7109.5 us -Grid : Message : 58.734933 s : WilsonFermion5D FaceTime /Calls : 446.623 us -Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls : 18.0558 us -Grid : Message : 58.734953 s : WilsonFermion5D 
ComputeTime2/Calls : 731.097 us -Grid : Message : 58.734979 s : Average mflops/s per call : 4.8157e+09 -Grid : Message : 58.734989 s : Average mflops/s per call per rank : 2.00654e+08 -Grid : Message : 58.734999 s : Average mflops/s per call per node : 1.20393e+09 -Grid : Message : 58.735008 s : Average mflops/s per call (full) : 1.04183e+07 -Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094 -Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06 -Grid : Message : 58.735035 s : WilsonFermion5D Stencil -Grid : Message : 58.735043 s : WilsonFermion5D StencilEven -Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd -Grid : Message : 58.735059 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness -Grid : Message : 64.934740 s : Called DwDag -Grid : Message : 64.934870 s : norm dag result 12.0422 -Grid : Message : 64.120756 s : norm dag ref 12.0422 -Grid : Message : 64.149389 s : norm dag diff 7.6644e-14 -Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec -Grid : Message : 64.465331 s : src_e0.499995 -Grid : Message : 64.524653 s : src_o0.500005 -Grid : Message : 64.558706 s : ********************************************************* -Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO -Grid : Message : 64.558727 s : * Vectorising space-time by 4 -Grid : Message : 64.558737 s : * SINGLE precision -Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute -Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 64.558761 s : ********************************************************* -Grid : Message : 92.702145 s : Deo mflop/s = 8.97692e+06 -Grid : Message : 92.702185 s : Deo mflop/s per rank 374038 -Grid : 
Message : 92.702198 s : Deo mflop/s per node 2.24423e+06 -Grid : Message : 92.702209 s : #### Dhop calls report -Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls : 3001 -Grid : Message : 92.702240 s : WilsonFermion5D TotalTime /Calls : 9377.88 us -Grid : Message : 92.702257 s : WilsonFermion5D CommTime /Calls : 8221.84 us -Grid : Message : 92.702277 s : WilsonFermion5D FaceTime /Calls : 543.548 us -Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls : 20.936 us -Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls : 732.33 us -Grid : Message : 92.702376 s : Average mflops/s per call : 4.13001e+09 -Grid : Message : 92.702387 s : Average mflops/s per call per rank : 1.72084e+08 -Grid : Message : 92.702397 s : Average mflops/s per call per node : 1.0325e+09 -Grid : Message : 92.702407 s : Average mflops/s per call (full) : 9.12937e+06 -Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391 -Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06 -Grid : Message : 92.702435 s : WilsonFermion5D Stencil -Grid : Message : 92.702443 s : WilsonFermion5D StencilEven -Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd -Grid : Message : 92.702459 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 92.772983 s : r_e6.02121 -Grid : Message : 92.786384 s : r_o6.02102 -Grid : Message : 92.799622 s : res12.0422 -Grid : Message : 93.860500 s : norm diff 0 -Grid : Message : 93.162026 s : norm diff even 0 -Grid : Message : 93.197529 s : norm diff odd 0 + + + + + + + +Grid : Message : 0.179000 s : ++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 0.196000 s : Testing with full communication +Grid : Message : 0.211000 s : ++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 0.225000 s : Grid Layout +Grid : 
Message : 0.233000 s : Global lattice size : 48 48 48 72 +Grid : Message : 0.246000 s : OpenMP threads : 6 +Grid : Message : 0.255000 s : MPI tasks : 2 2 2 3 +Grid : Message : 0.182200 s : Initialising 4d RNG +Grid : Message : 0.233863 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.233886 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.245805 s : Initialising 5d RNG +Grid : Message : 1.710720 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 1.710950 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 2.220272 s : Drawing gauge field +Grid : Message : 2.418119 s : Random gauge initialised +Grid : Message : 2.418142 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0] +Grid : Message : 2.418156 s : Applying BCs for Dirichlet Block4 [0 0 0 0] +Grid : Message : 2.489588 s : Setting up Cshift based reference +Grid : Message : 13.921239 s : ***************************************************************** +Grid : Message : 13.921261 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 13.921270 s : ***************************************************************** +Grid : Message : 13.921279 s : ***************************************************************** +Grid : Message : 13.921288 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 13.921296 s : * Vectorising space-time by 4 +Grid : Message : 13.921305 s : * VComplexF size is 32 B +Grid : Message : 13.921314 s : * SINGLE precision +Grid : Message : 13.921321 s : * Using Overlapped Comms/Compute +Grid : Message : 13.921328 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 13.921335 s : ***************************************************************** +Grid : Message : 14.821339 s : Called warmup +Grid : Message : 23.975467 s : Called Dw 300 times in 9.15155e+06 us +Grid : Message : 23.975528 s : mflop/s = 5.51286e+06 
+Grid : Message : 23.975543 s : mflop/s per rank = 229702 +Grid : Message : 23.975557 s : mflop/s per node = 229702 +Grid : Message : 23.989684 s : norm diff 5.09279e-313 Line 291 +Grid : Message : 39.450493 s : ---------------------------------------------------------------- +Grid : Message : 39.450517 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 39.450526 s : ---------------------------------------------------------------- +Grid : Message : 39.450534 s : Called DwDag +Grid : Message : 39.450542 s : norm dag result nan +Grid : Message : 39.451564 s : norm dag ref nan +Grid : Message : 39.455714 s : norm dag diff nan Line 354 diff --git a/systems/Summit/dwf.32.4node b/systems/Summit/dwf.32.4node index eed54f2d..fe21bad8 100644 --- a/systems/Summit/dwf.32.4node +++ b/systems/Summit/dwf.32.4node @@ -10,14 +10,21 @@ AcceleratorCudaInit[0]: warpSize: 32 AcceleratorCudaInit[0]: pciBusID: 4 AcceleratorCudaInit[0]: pciDeviceID: 0 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) -AcceleratorCudaInit: rank 0 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses +AcceleratorCudaInit: a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no local rank 0 device 0 bus id: 0004:04:00.0 AcceleratorCudaInit: ================================================ SharedMemoryMpi: World communicator of size 24 -SharedMemoryMpi: Node communicator of size 6 -0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers +SharedMemoryMpi: Node communicator of size 1 +local rank 3 device 0 bus id: 0004:04:00.0 +local rank 2 device 0 bus id: 0004:04:00.0 +local rank 1 device 0 bus id: 0004:04:00.0 +0SharedMemoryMpi: 
SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200080000000 - 2000bfffffff for comms buffers Setting up IPC +local rank 5 device 0 bus id: 0004:04:00.0 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ __|__|__|__|__|__|__|__|__|__|__|__|__|__|__ @@ -39,168 +46,46 @@ Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli a This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +local rank 4 device 0 bus id: 0004:04:00.0 the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -AcceleratorCudaInit: rank 2 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 2 device 2 bus id: 0004:06:00.0 -AcceleratorCudaInit: rank 1 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 1 device 1 bus id: 0004:05:00.0 -AcceleratorCudaInit: rank 4 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 4 device 4 bus id: 0035:04:00.0 -AcceleratorCudaInit: rank 3 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 3 device 3 bus id: 0035:03:00.0 -AcceleratorCudaInit: rank 5 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -local rank 5 device 5 bus id: 0035:05:00.0 GNU General Public License for more details. 
-Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean +Current Grid git commit hash=1713de35c0dc339564661dd7df8a72583f889e91: (HEAD -> feature/dirichlet) uncommited changes Grid : Message : ================================================ Grid : Message : MPI is initialised and logging filters activated Grid : Message : ================================================ -Grid : Message : Requested 2147483648 byte stencil comms buffers -Grid : Message : MemoryManager Cache 8388608000 bytes +Grid : Message : Requested 1073741824 byte stencil comms buffers Grid : Message : MemoryManager::Init() setting up Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2 -Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory -Grid : Message : MemoryManager::Init() Using cudaMalloc -Grid : Message : 1.544984 s : Grid Layout -Grid : Message : 1.544992 s : Global lattice size : 64 64 64 96 -Grid : Message : 1.545003 s : OpenMP threads : 6 -Grid : Message : 1.545011 s : MPI tasks : 2 2 2 3 -AcceleratorCudaInit: rank 8 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 6 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 11 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 16 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 17 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 13 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 12 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes 
-AcceleratorCudaInit: rank 21 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 23 setting device to node rank 5 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 22 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 19 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 18 setting device to node rank 0 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 7 setting device to node rank 1 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 10 setting device to node rank 4 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 9 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 14 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 15 setting device to node rank 3 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -AcceleratorCudaInit: rank 20 setting device to node rank 2 -AcceleratorCudaInit: Configure options --enable-setdevice=yes -Grid : Message : 2.994920 s : Making s innermost grids -Grid : Message : 2.232502 s : Initialising 4d RNG -Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG' -Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 -Grid : Message : 2.653140 s : Initialising 5d RNG -Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG' -Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a -Grid : Message : 9.994738 s : Initialised RNGs -Grid : Message : 13.153426 s : Drawing gauge field -Grid 
: Message : 13.825697 s : Random gauge initialised -Grid : Message : 18.537657 s : Setting up Cshift based reference -Grid : Message : 22.296755 s : ***************************************************************** -Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm -Grid : Message : 22.296791 s : ***************************************************************** -Grid : Message : 22.296800 s : ***************************************************************** -Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop -Grid : Message : 22.296818 s : * Vectorising space-time by 4 -Grid : Message : 22.296828 s : * VComplexF size is 32 B -Grid : Message : 22.296838 s : * SINGLE precision -Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute -Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 22.296863 s : ***************************************************************** -Grid : Message : 24.746452 s : Called warmup -Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us -Grid : Message : 137.525818 s : mflop/s = 1.41383e+07 -Grid : Message : 137.525831 s : mflop/s per rank = 589097 -Grid : Message : 137.525843 s : mflop/s per node = 3.53458e+06 -Grid : Message : 137.525854 s : RF GiB/s (base 2) = 28728.7 -Grid : Message : 137.525864 s : mem GiB/s (base 2) = 17955.5 -Grid : Message : 137.693645 s : norm diff 1.04885e-13 -Grid : Message : 137.965585 s : #### Dhop calls report -Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls : 6002 -Grid : Message : 137.965612 s : WilsonFermion5D TotalTime /Calls : 18899.7 us -Grid : Message : 137.965624 s : WilsonFermion5D CommTime /Calls : 16041.4 us -Grid : Message : 137.965634 s : WilsonFermion5D FaceTime /Calls : 859.705 us -Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls : 70.5881 us -Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls : 2094.8 us -Grid : Message 
: 137.965682 s : Average mflops/s per call : 3.87638e+09 -Grid : Message : 137.965692 s : Average mflops/s per call per rank : 1.61516e+08 -Grid : Message : 137.965702 s : Average mflops/s per call per node : 9.69095e+08 -Grid : Message : 137.965712 s : Average mflops/s per call (full) : 1.43168e+07 -Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533 -Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06 -Grid : Message : 137.965740 s : WilsonFermion5D Stencil -Grid : Message : 137.965748 s : WilsonFermion5D StencilEven -Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd -Grid : Message : 137.965764 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness -Grid : Message : 156.554632 s : Called DwDag -Grid : Message : 156.554642 s : norm dag result 12.0421 -Grid : Message : 156.639265 s : norm dag ref 12.0421 -Grid : Message : 156.888281 s : norm dag diff 7.62057e-14 -Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec -Grid : Message : 158.208630 s : src_e0.499996 -Grid : Message : 158.162447 s : src_o0.500004 -Grid : Message : 158.267780 s : ********************************************************* -Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO -Grid : Message : 158.267801 s : * Vectorising space-time by 4 -Grid : Message : 158.267811 s : * SINGLE precision -Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute -Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels -Grid : Message : 158.267836 s : ********************************************************* -Grid : Message : 216.487829 s : Deo mflop/s = 1.37283e+07 -Grid : Message : 216.487869 s : Deo mflop/s per rank 572011 -Grid : Message : 
216.487881 s : Deo mflop/s per node 3.43206e+06 -Grid : Message : 216.487893 s : #### Dhop calls report -Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls : 3001 -Grid : Message : 216.487913 s : WilsonFermion5D TotalTime /Calls : 19399.6 us -Grid : Message : 216.487923 s : WilsonFermion5D CommTime /Calls : 16475.4 us -Grid : Message : 216.487933 s : WilsonFermion5D FaceTime /Calls : 972.393 us -Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls : 49.8474 us -Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls : 2089.93 us -Grid : Message : 216.488001 s : Average mflops/s per call : 5.39682e+09 -Grid : Message : 216.488011 s : Average mflops/s per call per rank : 2.24867e+08 -Grid : Message : 216.488020 s : Average mflops/s per call per node : 1.3492e+09 -Grid : Message : 216.488030 s : Average mflops/s per call (full) : 1.39479e+07 -Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162 -Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06 -Grid : Message : 216.488057 s : WilsonFermion5D Stencil -Grid : Message : 216.488065 s : WilsonFermion5D StencilEven -Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd -Grid : Message : 216.488081 s : WilsonFermion5D Stencil Reporti() -Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti() -Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd Reporti() -Grid : Message : 217.384495 s : r_e6.02113 -Grid : Message : 217.426121 s : r_o6.02096 -Grid : Message : 217.472636 s : res12.0421 -Grid : Message : 218.200068 s : norm diff 0 -Grid : Message : 218.645673 s : norm diff even 0 -Grid : Message : 218.816561 s : norm diff odd 0 +Grid : Message : MemoryManager::Init() Unified memory space +Grid : Message : MemoryManager::Init() Using cudaMallocManaged + + + + + + + +Grid : Message : 0.139000 s : ++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 0.151000 s : Testing with full 
communication +Grid : Message : 0.158000 s : ++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 0.165000 s : Grid Layout +Grid : Message : 0.171000 s : Global lattice size : 64 64 64 96 +Grid : Message : 0.181000 s : OpenMP threads : 6 +Grid : Message : 0.189000 s : MPI tasks : 2 2 2 3 +Grid : Message : 0.177717 s : Initialising 4d RNG +Grid : Message : 0.342461 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.342483 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.370454 s : Initialising 5d RNG +Grid : Message : 3.174160 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 3.174420 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 22.119339 s : Drawing gauge field +Grid : Message : 38.113060 s : Random gauge initialised +Grid : Message : 38.113320 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0] +Grid : Message : 38.113470 s : Applying BCs for Dirichlet Block4 [0 0 0 0] +Grid : Message : 43.906786 s : Setting up Cshift based reference diff --git a/systems/Summit/dwf4.lsf b/systems/Summit/dwf4.lsf index 7d940338..51560f71 100644 --- a/systems/Summit/dwf4.lsf +++ b/systems/Summit/dwf4.lsf @@ -7,16 +7,15 @@ export OMP_NUM_THREADS=6 export PAMI_IBV_ADAPTER_AFFINITY=1 export PAMI_ENABLE_STRIPING=1 +export PAMI_DISABLE_IPC=1 export OPT="--comms-concurrent --comms-overlap " -#export GRID_ALLOC_NCACHE_LARGE=1 -export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 " -jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.4node -APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " -jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.24.4node -APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 
--shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT " -jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.32.4node +APP="./wrap.sh ./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --device-mem 4000 --shm-force-mpi 1 $OPT " +jsrun --nrs 24 -a1 -g1 -c6 -dpacked -b packed:6 --latency_priority gpu-cpu --smpiargs="-gpu" $APP > dwf.24.4node + +APP="./wrap.sh ./benchmarks/Benchmark_comms_host_device --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --device-mem 4000 --shm-force-mpi 1 $OPT " +jsrun --smpiargs="-gpu" --nrs 4 -a6 -g6 -c42 -dpacked -b packed:6 $APP > comms.24.4node From 66d001ec9eea6d204acc8dd59041a7e09abeac31 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 3 Oct 2022 10:59:38 -0400 Subject: [PATCH 158/240] Refactored Wilson flow class; previously the class implemented both iterative and adaptive smearing, but only the iterative method was accessible through the Smearing base class. The implementation of Smearing also forced a clunky need to pass iterative smearing parameters through the constructor but adaptive smearing parameters through the function call. Now there is a WilsonFlowBase class that implements common functionality, and separate WilsonFlow (iterative) and WilsonFlowAdaptive (adaptive) classes, both of which implement Smearing virtual functions. Modified the Wilson flow adaptive smearing step size update to implement the original Ramos definition of the distance, where previously it used the norm of a difference which scales with the volume and so would choose too coarse or too fine steps depending on the volume. This is based on Chulwoo's code. Added a test comparing adaptive (with tuneable tolerance) to iterative Wilson flow smearing on a random gauge configuration. 
--- Grid/qcd/observables/topological_charge.h | 15 +- Grid/qcd/smearing/WilsonFlow.h | 302 ++++++++++++--------- tests/smearing/Test_WilsonFlow_adaptive.cc | 153 +++++++++++ 3 files changed, 336 insertions(+), 134 deletions(-) create mode 100644 tests/smearing/Test_WilsonFlow_adaptive.cc diff --git a/Grid/qcd/observables/topological_charge.h b/Grid/qcd/observables/topological_charge.h index 7c09a180..220ed738 100644 --- a/Grid/qcd/observables/topological_charge.h +++ b/Grid/qcd/observables/topological_charge.h @@ -31,15 +31,16 @@ directory NAMESPACE_BEGIN(Grid); + struct TopologySmearingParameters : Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters, - int, steps, - float, step_size, int, meas_interval, - float, maxTau); + float, init_step_size, + float, maxTau, + float, tolerance); - TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f): - steps(s), step_size(ss), meas_interval(mi), maxTau(mT){} + TopologySmearingParameters(float ss = 0.0f, int mi = 0, float mT = 0.0f, float tol = 1e-4): + init_step_size(ss), meas_interval(mi), maxTau(mT), tolerance(tol){} template < class ReaderClass > TopologySmearingParameters(Reader& Reader){ @@ -97,8 +98,8 @@ public: if (Pars.do_smearing){ // using wilson flow by default here - WilsonFlow WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); - WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); + WilsonFlowAdaptive WF(Pars.Smearing.init_step_size, Pars.Smearing.maxTau, Pars.Smearing.tolerance, Pars.Smearing.meas_interval); + WF.smear(Usmear, U); Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear); std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) << "T0 : [ " << traj << " ] "<< T0 << std::endl; diff --git a/Grid/qcd/smearing/WilsonFlow.h b/Grid/qcd/smearing/WilsonFlow.h index 0d1ee5d2..f169d02b 100644 --- a/Grid/qcd/smearing/WilsonFlow.h +++ b/Grid/qcd/smearing/WilsonFlow.h @@ -33,27 +33,25 @@ 
directory NAMESPACE_BEGIN(Grid); template -class WilsonFlow: public Smear{ +class WilsonFlowBase: public Smear{ public: //Store generic measurements to take during smearing process using std::function typedef std::function FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field - -private: - unsigned int Nstep; - RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step - + +protected: std::vector< std::pair > functions; //The int maps to the measurement frequency mutable WilsonGaugeAction SG; - - //Evolve the gauge field by 1 step and update tau - void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const; - //Evolve the gauge field by 1 step and update tau and the current time step eps - void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const; - + public: INHERIT_GIMPL_TYPES(Gimpl) + explicit WilsonFlowBase(unsigned int meas_interval =1): + SG(WilsonGaugeAction(3.0)) { + // WilsonGaugeAction with beta 3.0 + setDefaultMeasurements(meas_interval); + } + void resetActions(){ functions.clear(); } void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); } @@ -64,34 +62,11 @@ public: //and output to stdout void setDefaultMeasurements(int topq_meas_interval = 1); - explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1): - Nstep(Nstep), - epsilon(epsilon), - SG(WilsonGaugeAction(3.0)) { - // WilsonGaugeAction with beta 3.0 - assert(epsilon > 0.0); - LogMessage(); - setDefaultMeasurements(interval); - } - - void LogMessage() { - std::cout << GridLogMessage - << "[WilsonFlow] Nstep : " << Nstep << std::endl; - std::cout << GridLogMessage - << "[WilsonFlow] epsilon : " << epsilon << std::endl; - std::cout << GridLogMessage - << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl; - } - - virtual void smear(GaugeField&, const GaugeField&) const; - - virtual void 
derivative(GaugeField&, const GaugeField&, const GaugeField&) const { + void derivative(GaugeField&, const GaugeField&, const GaugeField&) const override{ assert(0); // undefined for WilsonFlow } - void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const; - //Compute t^2 for time t from the plaquette static RealD energyDensityPlaquette(const RealD t, const GaugeField& U); @@ -115,82 +90,63 @@ public: std::vector flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1); }; +//Basic iterative Wilson flow +template +class WilsonFlow: public WilsonFlowBase{ +private: + int Nstep; //number of steps + RealD epsilon; //step size + + //Evolve the gauge field by 1 step of size eps and update tau + void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const; + +public: + INHERIT_GIMPL_TYPES(Gimpl) + + //Integrate the Wilson flow for Nstep steps of size epsilon + WilsonFlow(const RealD epsilon, const int Nstep, unsigned int meas_interval = 1): WilsonFlowBase(meas_interval), Nstep(Nstep), epsilon(epsilon){} + + void smear(GaugeField& out, const GaugeField& in) const override; +}; + +//Wilson flow with adaptive step size +template +class WilsonFlowAdaptive: public WilsonFlowBase{ +private: + RealD init_epsilon; //initial step size + RealD maxTau; //integrate to t=maxTau + RealD tolerance; //integration error tolerance + + //Evolve the gauge field by 1 step and update tau and the current time step eps + // + //If the step size eps is too large that a significant integration error results, + //the gauge field (U) and tau will not be updated and the function will return 0; eps will be adjusted to a smaller + //value for the next iteration. 
+ // + //For a successful integration step the function will return 1 + int evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps) const; + +public: + INHERIT_GIMPL_TYPES(Gimpl) + + WilsonFlowAdaptive(const RealD init_epsilon, const RealD maxTau, const RealD tolerance, unsigned int meas_interval = 1): + WilsonFlowBase(meas_interval), init_epsilon(init_epsilon), maxTau(maxTau), tolerance(tolerance){} + + void smear(GaugeField& out, const GaugeField& in) const override; +}; //////////////////////////////////////////////////////////////////////////////// // Implementations //////////////////////////////////////////////////////////////////////////////// template -void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{ - GaugeField Z(U.Grid()); - GaugeField tmp(U.Grid()); - SG.deriv(U, Z); - Z *= 0.25; // Z0 = 1/4 * F(U) - Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 - - Z *= -17.0/8.0; - SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 - Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 - Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 - - Z *= -4.0/3.0; - SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 - Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 - Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 - tau += epsilon; -} - -template -void WilsonFlow::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{ - if (maxTau - tau < eps){ - eps = maxTau-tau; - } - //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; - GaugeField Z(U.Grid()); - GaugeField Zprime(U.Grid()); - GaugeField tmp(U.Grid()), Uprime(U.Grid()); - Uprime = U; - SG.deriv(U, Z); - Zprime = -Z; - Z *= 0.25; // Z0 = 1/4 * F(U) - Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0 - - Z *= -17.0/8.0; - SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 - Zprime += 2.0*tmp; - Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 - Gimpl::update_field(Z, U, 
-2.0*eps); // U_= W2 = exp(ep*Z)*W1 - - - Z *= -4.0/3.0; - SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 - Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 - Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2 - - // Ramos - Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0 - // Compute distance as norm^2 of the difference - GaugeField diffU = U - Uprime; - RealD diff = norm2(diffU); - // adjust integration step - - tau += eps; - //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; - - eps = eps*0.95*std::pow(1e-4/diff,1./3.); - //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; - -} - - -template -RealD WilsonFlow::energyDensityPlaquette(const RealD t, const GaugeField& U){ +RealD WilsonFlowBase::energyDensityPlaquette(const RealD t, const GaugeField& U){ static WilsonGaugeAction SG(3.0); return 2.0 * t * t * SG.S(U)/U.Grid()->gSites(); } //Compute t^2 for time from the 1x1 cloverleaf form template -RealD WilsonFlow::energyDensityCloverleaf(const RealD t, const GaugeField& U){ +RealD WilsonFlowBase::energyDensityCloverleaf(const RealD t, const GaugeField& U){ typedef typename Gimpl::GaugeLinkField GaugeMat; typedef typename Gimpl::GaugeField GaugeLorentz; @@ -215,7 +171,7 @@ RealD WilsonFlow::energyDensityCloverleaf(const RealD t, const GaugeField template -std::vector WilsonFlow::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){ +std::vector WilsonFlowBase::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){ std::vector out; resetActions(); addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ @@ -227,13 +183,13 @@ std::vector WilsonFlow::flowMeasureEnergyDensityPlaquette(GaugeFie } template -std::vector WilsonFlow::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){ +std::vector 
WilsonFlowBase::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){ GaugeField V(U); return flowMeasureEnergyDensityPlaquette(V,U, measure_interval); } template -std::vector WilsonFlow::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){ +std::vector WilsonFlowBase::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){ std::vector out; resetActions(); addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){ @@ -245,16 +201,52 @@ std::vector WilsonFlow::flowMeasureEnergyDensityCloverleaf(GaugeFi } template -std::vector WilsonFlow::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){ +std::vector WilsonFlowBase::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){ GaugeField V(U); return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval); } +template +void WilsonFlowBase::setDefaultMeasurements(int topq_meas_interval){ + addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl; + }); + addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Top. 
charge : " << step << " " << WilsonLoops::TopologicalCharge(U) << std::endl; + }); +} -//#define WF_TIMING + +template +void WilsonFlow::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{ + GaugeField Z(U.Grid()); + GaugeField tmp(U.Grid()); + this->SG.deriv(U, Z); + Z *= 0.25; // Z0 = 1/4 * F(U) + Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0 + + Z *= -17.0/8.0; + this->SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 + Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 + Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1 + + Z *= -4.0/3.0; + this->SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 + Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 + Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2 + tau += epsilon; +} + template void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const{ + std::cout << GridLogMessage + << "[WilsonFlow] Nstep : " << Nstep << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] epsilon : " << epsilon << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl; + out = in; RealD taus = 0.; for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement @@ -266,37 +258,93 @@ void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const{ std::cout << "Time to evolve " << diff.count() << " s\n"; #endif //Perform measurements - for(auto const &meas : functions) + for(auto const &meas : this->functions) if( step % meas.first == 0 ) meas.second(step,taus,out); } } + + template -void WilsonFlow::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{ - out = in; - RealD taus = 0.; - RealD eps = epsilon; - unsigned int step = 0; - do{ - step++; - //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; - evolve_step_adaptive(out, taus, eps, maxTau); - //Perform measurements - for(auto const &meas : functions) - 
if( step % meas.first == 0 ) meas.second(step,taus,out); - } while (taus < maxTau); +int WilsonFlowAdaptive::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps) const{ + if (maxTau - tau < eps){ + eps = maxTau-tau; + } + //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; + GaugeField Z(U.Grid()); + GaugeField Zprime(U.Grid()); + GaugeField tmp(U.Grid()), Uprime(U.Grid()), Usave(U.Grid()); + Uprime = U; + Usave = U; + + this->SG.deriv(U, Z); + Zprime = -Z; + Z *= 0.25; // Z0 = 1/4 * F(U) + Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0 + + Z *= -17.0/8.0; + this->SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1 + Zprime += 2.0*tmp; + Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1 + Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1 + + + Z *= -4.0/3.0; + this->SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2 + Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2 + Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2 + + // Ramos arXiv:1301.4388 + Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0 + + // Compute distance using Ramos' definition + GaugeField diffU = U - Uprime; + RealD max_dist = 0; + + for(int mu=0;mu(diffU, mu); + RealD dist_mu = sqrt( maxLocalNorm2(diffU_mu) ) /Nc/Nc; //maximize over sites + max_dist = std::max(max_dist, dist_mu); //maximize over mu + } + + int ret; + if(max_dist < tolerance) { + tau += eps; + ret = 1; + } else { + U = Usave; + ret = 0; + } + eps = eps*0.95*std::pow(tolerance/max_dist,1./3.); + std::cout << GridLogMessage << "Adaptive smearing : Distance: "<< max_dist <<" Step successful: " << ret << " New epsilon: " << eps << std::endl; + + return ret; } template -void WilsonFlow::setDefaultMeasurements(int topq_meas_interval){ - addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){ - std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << 
energyDensityPlaquette(t,U) << std::endl; - }); - addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){ - std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops::TopologicalCharge(U) << std::endl; - }); +void WilsonFlowAdaptive::smear(GaugeField& out, const GaugeField& in) const{ + std::cout << GridLogMessage + << "[WilsonFlow] initial epsilon : " << init_epsilon << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] full trajectory : " << maxTau << std::endl; + std::cout << GridLogMessage + << "[WilsonFlow] tolerance : " << tolerance << std::endl; + out = in; + RealD taus = 0.; + RealD eps = init_epsilon; + unsigned int step = 0; + do{ + int step_success = evolve_step_adaptive(out, taus, eps); + step += step_success; //step will not be incremented if the integration step fails + + //Perform measurements + if(step_success) + for(auto const &meas : this->functions) + if( step % meas.first == 0 ) meas.second(step,taus,out); + } while (taus < maxTau); } + NAMESPACE_END(Grid); diff --git a/tests/smearing/Test_WilsonFlow_adaptive.cc b/tests/smearing/Test_WilsonFlow_adaptive.cc new file mode 100644 index 00000000..23123eb9 --- /dev/null +++ b/tests/smearing/Test_WilsonFlow_adaptive.cc @@ -0,0 +1,153 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/hmc/Test_WilsonFlow_adaptive.cc + +Copyright (C) 2017 + +Author: Christopher Kelly + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; + +//Linearly interpolate between two nearest times +RealD interpolate(const RealD t_int, const std::vector > &data){ + RealD tdiff1=1e32; int t1_idx=-1; + RealD tdiff2=1e32; int t2_idx=-1; + + for(int i=0;i seeds({1, 2, 3, 4, 5}); + GridSerialRNG sRNG; + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField U(&Grid); + SU::HotConfiguration(pRNG, U); + + int Nstep = 300; + RealD epsilon = 0.01; + RealD maxTau = Nstep*epsilon; + RealD tolerance = 1e-4; + + for(int i=1;i> tolerance; + } + } + std::cout << "Adaptive smear tolerance " << tolerance << std::endl; + + //Setup iterative Wilson flow + WilsonFlow wflow(epsilon,Nstep); + wflow.resetActions(); + + std::vector > meas_orig; + + wflow.addMeasurement(1, [&wflow,&meas_orig](int step, RealD t, const LatticeGaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl; + meas_orig.push_back( {t, wflow.energyDensityCloverleaf(t,U)} ); + }); + + //Setup adaptive Wilson flow + WilsonFlowAdaptive wflow_ad(epsilon,maxTau,tolerance); + wflow_ad.resetActions(); + + std::vector > meas_adaptive; + + wflow_ad.addMeasurement(1, [&wflow_ad,&meas_adaptive](int step, RealD t, const LatticeGaugeField &U){ + std::cout << GridLogMessage << "[WilsonFlow] Computing Cloverleaf energy density for step " << step << std::endl; + meas_adaptive.push_back( {t, wflow_ad.energyDensityCloverleaf(t,U)} ); + }); + + //Run + LatticeGaugeFieldD Vtmp(U.Grid()); + 
wflow.smear(Vtmp, U); //basic smear + + Vtmp = Zero(); + wflow_ad.smear(Vtmp, U); + + //Output values for plotting + { + std::ofstream out("wflow_t2E_orig.dat"); + out.precision(16); + for(auto const &e: meas_orig){ + out << e.first << " " << e.second << std::endl; + } + } + { + std::ofstream out("wflow_t2E_adaptive.dat"); + out.precision(16); + for(auto const &e: meas_adaptive){ + out << e.first << " " << e.second << std::endl; + } + } + + //Compare at times available with adaptive smearing + for(int i=0;i Date: Tue, 11 Oct 2022 14:44:55 -0400 Subject: [PATCH 159/240] RRII gpu option --- Grid/communicator/Communicator_base.h | 2 +- Grid/lattice/Lattice_ET.h | 2 +- Grid/lattice/Lattice_matrix_reduction.h | 3 - Grid/lattice/Lattice_peekpoke.h | 14 +- Grid/lattice/Lattice_reduction.h | 12 +- Grid/lattice/Lattice_reduction_gpu.h | 2 - Grid/lattice/Lattice_transfer.h | 6 +- .../CayleyFermion5DImplementation.h | 82 -- Grid/qcd/utils/SUn.h | 1 - Grid/simd/Grid_a64fx-2.h | 4 +- Grid/simd/Grid_a64fx-fixedsize.h | 8 +- Grid/simd/Grid_avx.h | 8 +- Grid/simd/Grid_avx512.h | 9 +- Grid/simd/Grid_gpu_rrii.h | 878 ++++++++++++++++++ Grid/simd/Grid_gpu_vec.h | 10 +- Grid/simd/Grid_qpx.h | 4 +- Grid/simd/Grid_sse4.h | 10 +- Grid/simd/Grid_vector_types.h | 170 ++-- Grid/simd/Simd.h | 2 + Grid/stencil/Stencil.h | 2 - Grid/tensors/Tensor_class.h | 3 + Grid/tensors/Tensor_extract_merge.h | 49 +- configure.ac | 84 +- systems/mac-arm/config-command-mpi | 4 +- 24 files changed, 1099 insertions(+), 270 deletions(-) create mode 100644 Grid/simd/Grid_gpu_rrii.h diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index cb3b9f0e..68cd36cc 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -131,7 +131,7 @@ public: template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; + scalar_type * ptr = 
(scalar_type *)& o; // Safe alias GlobalSumVector(ptr,words); } diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index 4a8a7423..fdd31b28 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, typename std::remove_const::type ret; typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; + // typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; const int Nsimd = vobj::vector_type::Nsimd(); diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h index 7c470fef..abebbfd6 100644 --- a/Grid/lattice/Lattice_matrix_reduction.h +++ b/Grid/lattice/Lattice_matrix_reduction.h @@ -32,7 +32,6 @@ template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X.Grid()->GlobalDimensions()[Orthog]; @@ -82,7 +81,6 @@ template static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X.Grid()->GlobalDimensions()[Orthog]; @@ -130,7 +128,6 @@ template static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; GridBase *FullGrid = lhs.Grid(); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index f3b485a4..b6a36b11 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -96,9 
+96,6 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ GridBase *grid=l.Grid(); - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - int Nsimd = grid->Nsimd(); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); @@ -136,9 +133,6 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ GridBase *grid=l.Grid(); - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - int Nsimd = grid->Nsimd(); assert( l.Checkerboard() == l.Grid()->CheckerBoard(site)); @@ -179,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) idx= grid->iIndex(site); odx= grid->oIndex(site); - scalar_type * vp = (scalar_type *)&l[odx]; + const vector_type *vp = (const vector_type *) &l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &l,Coordinate &site) idx= grid->iIndex(site); odx= grid->oIndex(site); - scalar_type * vp = (scalar_type *)&l[odx]; + vector_type * vp = (vector_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w inline RealD maxLocalNorm2(const Lattice &arg) template inline ComplexD rankInnerProduct(const Lattice &left,const Lattice &right) { - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; ComplexD nrm; @@ -296,7 +295,6 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt conformable(z,x); conformable(x,y); - typedef typename vobj::scalar_type scalar_type; // typedef typename vobj::vector_typeD vector_type; RealD nrm; @@ -341,7 +339,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti { conformable(left,right); - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; Vector tmp(2); @@ -597,7 +594,8 @@ static void sliceNorm (std::vector &sn,const Lattice &rhs,int Ortho template static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, 
int orthogdim,RealD scale=1.0) -{ +{ + // perhaps easier to just promote A to a field and use regular madd typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -628,8 +626,7 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice for(int l=0;liCoorFromIindex(icoor,l); int ldx =r+icoor[orthogdim]*rd; - scalar_type *as =(scalar_type *)&av; - as[l] = scalar_type(a[ldx])*zscale; + av.putlane(scalar_type(a[ldx])*zscale,l); } tensor_reduced at; at=av; @@ -669,7 +666,6 @@ template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X.Grid()->GlobalDimensions()[Orthog]; @@ -723,7 +719,6 @@ template static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nblock = X.Grid()->GlobalDimensions()[Orthog]; @@ -777,7 +772,6 @@ template static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; GridBase *FullGrid = lhs.Grid(); diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index bad86d2a..5f5c6cc0 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -250,8 +250,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi template inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) { - typedef typename vobj::vector_type vector; - typedef 
typename vobj::scalar_typeD scalarD; typedef typename vobj::scalar_objectD sobj; sobj ret; diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index aee55e93..d9b7a704 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -677,10 +677,10 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Integer idx_t = 0; for(int d=0;d::SeqConservedCurrent(PropagatorField &q_in, #undef TopRowWithSource - -#if 0 -template -void CayleyFermion5D::MooeeInternalCompute(int dag, int inv, - Vector > & Matp, - Vector > & Matm) -{ - int Ls=this->Ls; - - GridBase *grid = this->FermionRedBlackGrid(); - int LLs = grid->_rdimensions[0]; - - if ( LLs == Ls ) { - return; // Not vectorised in 5th direction - } - - Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); - Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); - - for(int s=0;s::iscomplex() ) { - sp[l] = PplusMat (l*istride+s1*ostride,s2); - sm[l] = PminusMat(l*istride+s1*ostride,s2); - } else { - // if real - scalar_type tmp; - tmp = PplusMat (l*istride+s1*ostride,s2); - sp[l] = scalar_type(tmp.real(),tmp.real()); - tmp = PminusMat(l*istride+s1*ostride,s2); - sm[l] = scalar_type(tmp.real(),tmp.real()); - } - } - Matp[LLs*s2+s1] = Vp; - Matm[LLs*s2+s1] = Vm; - }} -} -#endif - NAMESPACE_END(Grid); diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index b9660c65..23eceea3 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -615,7 +615,6 @@ public: GridBase *grid = out.Grid(); typedef typename LatticeMatrixType::vector_type vector_type; - typedef typename LatticeMatrixType::scalar_type scalar_type; typedef iSinglet vTComplexType; diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h index 2ad8591c..a4ef70ae 100644 --- a/Grid/simd/Grid_a64fx-2.h +++ b/Grid/simd/Grid_a64fx-2.h @@ -501,7 +501,7 @@ struct Conj{ struct TimesMinusI{ // Complex template - inline vec operator()(vec a, vec b){ + inline vec operator()(vec a){ vec out; 
const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); @@ -520,7 +520,7 @@ struct TimesMinusI{ struct TimesI{ // Complex template - inline vec operator()(vec a, vec b){ + inline vec operator()(vec a){ vec out; const vec::uint> tbl_swap = acle::tbl_swap(); svbool_t pg1 = acle::pg1(); diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h index 6b450012..5bf1b0a3 100644 --- a/Grid/simd/Grid_a64fx-fixedsize.h +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -418,7 +418,7 @@ struct Conj{ struct TimesMinusI{ // Complex float - inline vecf operator()(vecf a, vecf b){ + inline vecf operator()(vecf a){ lutf tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_odd = acle::pg_odd(); @@ -428,7 +428,7 @@ struct TimesMinusI{ return svneg_m(a_v, pg_odd, a_v); } // Complex double - inline vecd operator()(vecd a, vecd b){ + inline vecd operator()(vecd a){ lutd tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_odd = acle::pg_odd(); @@ -441,7 +441,7 @@ struct TimesMinusI{ struct TimesI{ // Complex float - inline vecf operator()(vecf a, vecf b){ + inline vecf operator()(vecf a){ lutf tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_even = acle::pg_even(); @@ -451,7 +451,7 @@ struct TimesI{ return svneg_m(a_v, pg_even, a_v); } // Complex double - inline vecd operator()(vecd a, vecd b){ + inline vecd operator()(vecd a){ lutd tbl_swap = acle::tbl_swap(); pred pg1 = acle::pg1(); pred pg_even = acle::pg_even(); diff --git a/Grid/simd/Grid_avx.h b/Grid/simd/Grid_avx.h index ad9800fb..f8962714 100644 --- a/Grid/simd/Grid_avx.h +++ b/Grid/simd/Grid_avx.h @@ -405,12 +405,12 @@ struct Conj{ struct TimesMinusI{ //Complex single - inline __m256 operator()(__m256 in, __m256 ret){ + inline __m256 operator()(__m256 in){ __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r } //Complex double - inline __m256d operator()(__m256d in, __m256d ret){ + 
inline __m256d operator()(__m256d in){ __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i return _mm256_shuffle_pd(tmp,tmp,0x5); } @@ -418,12 +418,12 @@ struct TimesMinusI{ struct TimesI{ //Complex single - inline __m256 operator()(__m256 in, __m256 ret){ + inline __m256 operator()(__m256 in){ __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r } //Complex double - inline __m256d operator()(__m256d in, __m256d ret){ + inline __m256d operator()(__m256d in){ __m256d tmp = _mm256_shuffle_pd(in,in,0x5); return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r } diff --git a/Grid/simd/Grid_avx512.h b/Grid/simd/Grid_avx512.h index 839d4554..95b96143 100644 --- a/Grid/simd/Grid_avx512.h +++ b/Grid/simd/Grid_avx512.h @@ -271,14 +271,14 @@ struct Conj{ struct TimesMinusI{ //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ + inline __m512 operator()(__m512 in){ //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E?? 
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); } //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ + inline __m512d operator()(__m512d in){ //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag //return _mm512_shuffle_pd(tmp,tmp,0x55); __m512d tmp = _mm512_shuffle_pd(in,in,0x55); @@ -288,17 +288,16 @@ struct TimesMinusI{ struct TimesI{ //Complex single - inline __m512 operator()(__m512 in, __m512 ret){ + inline __m512 operator()(__m512 in){ __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); } //Complex double - inline __m512d operator()(__m512d in, __m512d ret){ + inline __m512d operator()(__m512d in){ __m512d tmp = _mm512_shuffle_pd(in,in,0x55); return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp); } - }; // Gpermute utilities consider coalescing into 1 Gpermute diff --git a/Grid/simd/Grid_gpu_rrii.h b/Grid/simd/Grid_gpu_rrii.h new file mode 100644 index 00000000..36f343e4 --- /dev/null +++ b/Grid/simd/Grid_gpu_rrii.h @@ -0,0 +1,878 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_gpu.h + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +//---------------------------------------------------------------------- +/*! @file Grid_gpu_rrii.h*/ +//---------------------------------------------------------------------- + +////////////////////////////// +// fp16 +////////////////////////////// +#ifdef GRID_CUDA +#include +#endif +#ifdef GRID_HIP +#include +#endif +#if !defined(GRID_HIP) && !defined(GRID_CUDA) +namespace Grid { + typedef struct { uint16_t x;} half; +} +#endif +namespace Grid { + accelerator_inline float half2float(half h) + { + float f; +#if defined(GRID_CUDA) || defined(GRID_HIP) + f = __half2float(h); +#else + Grid_half hh; + hh.x = h.x; + f= sfw_half_to_float(hh); +#endif + return f; + } + accelerator_inline half float2half(float f) + { + half h; +#if defined(GRID_CUDA) || defined(GRID_HIP) + h = __float2half(f); +#else + Grid_half hh = sfw_float_to_half(f); + h.x = hh.x; +#endif + return h; + } +} + + +#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) + +namespace Grid { + +//////////////////////////////////////////////////////////////////////// +// Real vector +//////////////////////////////////////////////////////////////////////// +template +struct GpuVector { + _datum rrrr[_N]; + static const int N = _N; + typedef _datum datum; +}; +template +inline accelerator GpuVector operator*(const GpuVector l,const GpuVector r) { + GpuVector ret; + for(int i=0;i +inline accelerator GpuVector operator-(const GpuVector l,const GpuVector r) { + GpuVector ret; + for(int i=0;i +inline accelerator GpuVector operator+(const GpuVector l,const GpuVector r) { + GpuVector ret; + for(int i=0;i +inline 
accelerator GpuVector operator/(const GpuVector l,const GpuVector r) { + GpuVector ret; + for(int i=0;i +struct GpuComplexVector { + _datum rrrr[_N]; + _datum iiii[_N]; + static const int N = _N; + typedef _datum datum; +}; +template +inline accelerator GpuComplexVector operator*(const GpuComplexVector l,const GpuComplexVector r) { + GpuComplexVector ret; + for(int i=0;i +inline accelerator GpuComplexVector operator-(const GpuComplexVector l,const GpuComplexVector r) { + GpuComplexVector ret; + for(int i=0;i +inline accelerator GpuComplexVector operator+(const GpuComplexVector l,const GpuComplexVector r) { + GpuComplexVector ret; + for(int i=0;i +inline accelerator GpuComplexVector operator/(const GpuComplexVector l,const GpuComplexVector r) { + GpuComplexVector ret; + for(int i=0;i GpuVectorRH; +typedef GpuComplexVector GpuVectorCH; +typedef GpuVector GpuVectorRF; +typedef GpuComplexVector GpuVectorCF; +typedef GpuVector GpuVectorRD; +typedef GpuComplexVector GpuVectorCD; +typedef GpuVector GpuVectorI; + +namespace Optimization { + + struct Vsplat{ + //Complex float + accelerator_inline GpuVectorCF operator()(float a, float b){ + GpuVectorCF ret; + for(int i=0;i + accelerator_inline void operator()(GpuVector a, P* Fp){ + GpuVector *vF = (GpuVector *)Fp; + *vF = a; + } + template + accelerator_inline void operator()(GpuComplexVector a, P* Fp){ + GpuComplexVector *vF = (GpuComplexVector *)Fp; + *vF = a; + } + }; + + struct Vstream{ + template + accelerator_inline void operator()(P* F,GpuVector a){ + GpuVector *vF = (GpuVector *)F; + *vF = a; + } + template + accelerator_inline void operator()(P* F,GpuComplexVector a){ + GpuComplexVector *vF = (GpuComplexVector *)F; + *vF = a; + } + }; + + struct Vset{ + // Complex float + accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){ + typedef GpuVectorCF vec; + vec ret; + for(int i=0;i + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + 
accelerator_inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Real float + accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ + return a+b; + } + accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ + return a+b; + } + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + return a+b; + } + accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ + return a+b; + } + accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ + return a+b; + } + }; + + struct Sub{ + accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ + return a-b; + } + accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ + return a-b; + } + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + return a-b; + } + accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ + return a-b; + } + accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ + return a-b; + } + }; + + struct MultRealPart{ + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + typedef GpuVectorCF vec; + vec ret; + for(int i=0;i + static accelerator_inline GpuVector<_N,_datum> PermuteN(GpuVector<_N,_datum> &in) { + typedef GpuVector<_N,_datum> vec; + vec out; + unsigned int _mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline GpuComplexVector<_N,_datum> PermuteN(GpuComplexVector<_N,_datum> &in) { + typedef GpuComplexVector<_N,_datum> vec; + vec out; + unsigned int _mask = vec::N >> (n + 1); + for(int i=0;i static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec::N,typename vec::datum>(in); } + template static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec::N,typename 
vec::datum>(in); } + template static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec::N,typename vec::datum>(in); } + template static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec::N,typename vec::datum>(in); } + + }; + + struct PrecisionChange { + + //////////////////////////////////////////////////////////////////////////////////// + // Single / Half + //////////////////////////////////////////////////////////////////////////////////// + static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { + int N = GpuVectorCF::N; + GpuVectorCH h; + for(int i=0;i + static accelerator_inline void ExchangeN(GpuVector<_N,_datum> &out1, + GpuVector<_N,_datum> &out2, + GpuVector<_N,_datum> &in1, + GpuVector<_N,_datum> &in2 ) + { + typedef GpuVector<_N,_datum> vec; + unsigned int mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline void ExchangeN(GpuComplexVector<_N,_datum> &out1, + GpuComplexVector<_N,_datum> &out2, + GpuComplexVector<_N,_datum> &in1, + GpuComplexVector<_N,_datum> &in2 ) + { + typedef GpuComplexVector<_N,_datum> vec; + unsigned int mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<0>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<1>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<2>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<3>(out1,out2,in1,in2); + }; + +}; + +struct Rotate{ + + template static accelerator_inline vec tRotate(vec in){ + return rotate(in, n); + } + + template + static accelerator_inline GpuComplexVector<_N,_datum> rotate_template(GpuComplexVector<_N,_datum> &in, int n) + { + typedef GpuComplexVector<_N,_datum> vec; + vec out; + for(int i=0;i + 
static accelerator_inline GpuVector<_N,_datum> rotate_template(GpuVector<_N,_datum> &in, int n) + { + typedef GpuVector<_N,_datum> vec; + vec out; + for(int i=0;i + accelerator_inline Grid::ComplexF + Reduce::operator()(GpuVectorCF in) + { + Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]); + for(int i=1;i + accelerator_inline Grid::ComplexD + Reduce::operator()(GpuVectorCD in) + { + Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]); + for(int i=1;i + accelerator_inline Grid::RealF + Reduce::operator()(GpuVectorRF in) + { + RealF ret = in.rrrr[0]; + for(int i=1;i + accelerator_inline Grid::RealD + Reduce::operator()(GpuVectorRD in) + { + RealD ret = in.rrrr[0]; + for(int i=1;i + accelerator_inline Integer + Reduce::operator()(GpuVectorI in) + { + Integer ret = in.rrrr[0]; + for(int i=1;i using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index b2c7588f..6f4528c7 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,7 +38,7 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCL +#if !defined(GRID_CUDA) && !defined(GRID_HIP) namespace Grid { typedef struct { uint16_t x;} half; typedef struct { half x; half y;} half2; @@ -486,7 +486,7 @@ namespace Optimization { struct TimesMinusI{ //Complex single - accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){ + accelerator_inline GpuVectorCF operator()(GpuVectorCF in){ typedef GpuVectorCF vec; vec ret; for(int i=0;i friend accelerator_inline 
Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; - Grid_simd::conv_t conv; Grid_simd::scalar_type s; - conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { - s = conv.s[i]; - conv.s[i] = func(s); + s = v.getlane(i); + s = func(s); + ret.putlane(s,i); } - ret.v = conv.v; return ret; } template @@ -571,18 +569,14 @@ public: const Grid_simd &x, const Grid_simd &y) { Grid_simd ret; - Grid_simd::conv_t cx; - Grid_simd::conv_t cy; Grid_simd::scalar_type sx,sy; - cx.v = x.v; - cy.v = y.v; for (int i = 0; i < Nsimd(); i++) { - sx = cx.s[i]; - sy = cy.s[i]; - cx.s[i] = func(sx,sy); + sx = x.getlane(i); + sy = y.getlane(i); + sx = func(sx,sy); + ret.putlane(sx,i); } - ret.v = cx.v; return ret; } /////////////////////// @@ -645,15 +639,36 @@ public: /////////////////////////////// // Getting single lanes /////////////////////////////// - accelerator_inline Scalar_type getlane(int lane) { +#ifdef GPU_RRII + template = 0> + accelerator_inline Scalar_type getlane(int lane) const { + return Scalar_type(v.rrrr[lane],v.iiii[lane]); + } + template = 0> + accelerator_inline void putlane(const Scalar_type &_S, int lane){ + v.rrrr[lane] = real(_S); + v.iiii[lane] = imag(_S); + } + template = 0> + accelerator_inline Scalar_type getlane(int lane) const { + return ((S*)&v)[lane]; + } + template = 0> + accelerator_inline void putlane(const S &_S, int lane){ + ((Scalar_type*)&v)[lane] = _S; + } +#else // Can pun to an array of complex + accelerator_inline Scalar_type getlane(int lane) const { return ((Scalar_type*)&v)[lane]; } - accelerator_inline void putlane(const Scalar_type &S, int lane){ ((Scalar_type*)&v)[lane] = S; } +#endif + }; // end of Grid_simd class definition + /////////////////////////////// // Define available types /////////////////////////////// @@ -663,7 +678,7 @@ typedef Grid_simd vRealD; typedef Grid_simd vInteger; typedef Grid_simd vRealH; -#ifdef GPU_VEC +#if defined(GPU_VEC) || defined(GPU_RRII) typedef Grid_simd, SIMD_CHtype> vComplexH; 
typedef Grid_simd , SIMD_CFtype> vComplexF; typedef Grid_simd , SIMD_CDtype> vComplexD; @@ -763,6 +778,7 @@ accelerator_inline void vsplat(Grid_simd &ret, NotEnableIf, } ////////////////////////// + /////////////////////////////////////////////// // Initialise to 1,0,i for the correct types /////////////////////////////////////////////// @@ -907,34 +923,6 @@ accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, G // ---------------------------------------------- -// Distinguish between complex types and others -template = 0> -accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - typedef Grid_simd simd; - - simd ret; - simd den; - typename simd::conv_t conv; - - ret = a * conjugate(b) ; - den = b * conjugate(b) ; - - // duplicates real part - auto real_den = toReal(den); - simd zden; - memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden)); - ret.v=binary(ret.v, zden.v, DivSIMD()); - return ret; -}; - -// Real/Integer types -template = 0> -accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { - Grid_simd ret; - ret.v = binary(a.v, b.v, DivSIMD()); - return ret; -}; - /////////////////////// // Conjugate /////////////////////// @@ -959,30 +947,29 @@ accelerator_inline Grid_simd adj(const Grid_simd &in) { /////////////////////// template = 0> accelerator_inline void timesMinusI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesMinusISIMD()); + ret.v = unary(in.v, TimesMinusISIMD()); } template = 0> accelerator_inline Grid_simd timesMinusI(const Grid_simd &in) { Grid_simd ret; - timesMinusI(ret, in); + ret.v=unary(in.v, TimesMinusISIMD()); return ret; } template = 0> accelerator_inline Grid_simd timesMinusI(const Grid_simd &in) { return in; } - /////////////////////// // timesI /////////////////////// template = 0> accelerator_inline void timesI(Grid_simd &ret, const Grid_simd &in) { - ret.v = binary(in.v, ret.v, TimesISIMD()); + ret.v = unary(in.v, TimesISIMD()); } template = 0> accelerator_inline Grid_simd 
timesI(const Grid_simd &in) { Grid_simd ret; - timesI(ret, in); + ret.v= unary(in.v, TimesISIMD()); return ret; } template = 0> @@ -990,6 +977,35 @@ accelerator_inline Grid_simd timesI(const Grid_simd &in) { return in; } + +// Distinguish between complex types and others +template = 0> +accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + typedef Grid_simd simd; + + simd ret; + simd den; + + ret = a * conjugate(b) ; + den = b * conjugate(b) ; + + // duplicates real part + auto real_den = toReal(den); + simd zden; + memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden)); + ret.v=binary(ret.v, zden.v, DivSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, DivSIMD()); + return ret; +}; + + ///////////////////// // Inner, outer ///////////////////// @@ -1021,12 +1037,12 @@ template // must be a real arg accelerator_inline typename toRealMapper::Realified toReal(const Csimd &in) { typedef typename toRealMapper::Realified Rsimd; Rsimd ret; - typename Rsimd::conv_t conv; - memcpy((void *)&conv.v,(void *)&in.v,sizeof(conv.v)); + int j=0; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - conv.s[i + 1] = conv.s[i]; // duplicate (r,r);(r,r);(r,r); etc... 
+ auto s = real(in.getlane(j++)); + ret.putlane(s,i); + ret.putlane(s,i+1); } - memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v)); return ret; } @@ -1039,18 +1055,19 @@ template // must be a real arg accelerator_inline typename toComplexMapper::Complexified toComplex(const Rsimd &in) { typedef typename toComplexMapper::Complexified Csimd; - typename Rsimd::conv_t conv; // address as real - - conv.v = in.v; + typedef typename Csimd::scalar_type scalar_type; + int j=0; + Csimd ret; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); + auto rr = in.getlane(i); + auto ri = in.getlane(i+1); + assert(rr==ri); // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match - conv.s[i + 1] = 0.0; // zero imaginary parts + scalar_type s(rr,0.0); + ret.putlane(s,j++); } - Csimd ret; - memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v)); return ret; } @@ -1146,6 +1163,27 @@ template <> struct is_simd : public std::true_type {}; template using IfSimd = Invoke::value, int> >; template using IfNotSimd = Invoke::value, unsigned> >; +/////////////////////////////////////////////// +// Convenience insert / extract with complex support +/////////////////////////////////////////////// +template +accelerator_inline S getlane(const Grid_simd &in,int lane) { + return in.getlane(lane); +} +template +accelerator_inline void putlane(Grid_simd &vec,const S &_S, int lane){ + vec.putlane(_S,lane); +} +template = 0 > +accelerator_inline S getlane(const S &in,int lane) { + return in; +} +template = 0 > +accelerator_inline void putlane(S &vec,const S &_S, int lane){ + vec = _S; +} + + NAMESPACE_END(Grid); #endif diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index 76ca3bef..9de192bb 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -69,6 +69,7 @@ typedef RealF Real; typedef thrust::complex ComplexF; typedef thrust::complex ComplexD; typedef thrust::complex Complex; +typedef 
thrust::complex ComplexH; template using complex = thrust::complex; accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(thrust::pow(r,(double)y)); } @@ -77,6 +78,7 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(thrust::pow(r typedef std::complex ComplexF; typedef std::complex ComplexD; typedef std::complex Complex; +typedef std::complex ComplexH; // Hack template using complex = std::complex; accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(std::pow(r,y)); } diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 19b5e6ea..e23ff258 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -216,7 +216,6 @@ class CartesianStencil : public CartesianStencilAccelerator View_type; typedef typename View_type::StencilVector StencilVector; @@ -1014,7 +1013,6 @@ public: int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point) { typedef typename cobj::vector_type vector_type; - typedef typename cobj::scalar_type scalar_type; int comms_send = this->_comms_send[point] ; int comms_recv = this->_comms_recv[point] ; diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index be045ede..f3114cb5 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -178,6 +178,7 @@ public: stream << "S {" << o._internal << "}"; return stream; }; + // FIXME These will break with change of data layout strong_inline const scalar_type * begin() const { return reinterpret_cast(&_internal); } strong_inline scalar_type * begin() { return reinterpret_cast< scalar_type *>(&_internal); } strong_inline const scalar_type * end() const { return begin() + Traits::count; } @@ -288,6 +289,7 @@ public: // return _internal[i]; // } + // FIXME These will break with change of data layout strong_inline const scalar_type * begin() const { return reinterpret_cast(_internal); } strong_inline scalar_type * begin() { return reinterpret_cast< scalar_type 
*>(_internal); } strong_inline const scalar_type * end() const { return begin() + Traits::count; } @@ -430,6 +432,7 @@ public: // return _internal[i][j]; // } + // FIXME These will break with change of data layout strong_inline const scalar_type * begin() const { return reinterpret_cast(_internal[0]); } strong_inline scalar_type * begin() { return reinterpret_cast< scalar_type *>(_internal[0]); } strong_inline const scalar_type * end() const { return begin() + Traits::count; } diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index ab14f81f..c63a2439 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -1,5 +1,5 @@ /************************************************************************************* -n + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_extract_merge.h @@ -62,8 +62,18 @@ void extract(const vobj &vec,ExtractBuffer &extracted) const int words=sizeof(vobj)/sizeof(vector_type); const int Nsimd=vector_type::Nsimd(); const int Nextr=extracted.size(); + vector_type * vp = (vector_type *)&vec; const int s=Nsimd/Nextr; sobj_scalar_type *sp = (sobj_scalar_type *) &extracted[0]; + sobj_scalar_type stmp; + for(int w=0;w &extracted) memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp)); } } + */ + return; } @@ -93,7 +105,7 @@ void merge(vobj &vec,ExtractBuffer &extracted) const int s=Nsimd/Nextr; sobj_scalar_type *sp = (sobj_scalar_type *)&extracted[0]; - scalar_type *vp = (scalar_type *)&vec; + vector_type *vp = (vector_type *)&vec; scalar_type vtmp; sobj_scalar_type stmp; for(int w=0;w &extracted) for(int ii=0;ii &extracted, int off const int Nextr=extracted.size(); const int s = Nsimd/Nextr; - scalar_type * vp = (scalar_type *)&vec; + vector_type * vp = (vector_type *)&vec; scalar_type vtmp; sobj_scalar_type stmp; for(int w=0;w &extracted, int offset) const int Nextr=extracted.size(); const int s = Nsimd/Nextr; - scalar_type * vp = 
(scalar_type *)&vec; + vector_type * vp = (vector_type *)&vec; scalar_type vtmp; sobj_scalar_type stmp; for(int w=0;w Date: Thu, 13 Oct 2022 17:55:50 -0400 Subject: [PATCH 160/240] Hack for lattice sites --- Grid/qcd/action/fermion/WilsonKernels.h | 12 ++-- .../WilsonKernelsImplementation.h | 56 ++++++++++++------- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 68422f28..c5d48095 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -90,22 +90,22 @@ private: // Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagExt(StencilView &st, 
DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static void AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 939fda33..b5fe63cc 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -61,9 +61,13 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ spProj(chi,tmp); \ } else { \ - chi = coalescedRead(buf[SE->_offset],lane); \ + int s = sF %sU ; \ + chi = Zero(); \ + if ( (s==0)||(s==Ls-1)) { \ + chi = coalescedRead(buf[SE->_offset],lane); \ + } \ } \ - acceleratorSynchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); @@ -80,11 +84,14 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ - if (!SE->_is_local ) { \ - auto chi = coalescedRead(buf[SE->_offset],lane); \ - Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ - Recon(result, Uchi); \ - nmu++; \ + if (!SE->_is_local ) { \ + int s = sF %sU ; \ + if ( (s==0)||(s==Ls-1)) { \ + auto chi = coalescedRead(buf[SE->_offset],lane); \ + Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ + } \ } \ acceleratorSynchronise(); @@ -111,7 +118,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// template 
accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -137,7 +144,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV template accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -166,7 +173,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -194,8 +201,8 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + SiteHalfSpinor *buf, int Ls, int sF, + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -224,7 +231,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, 
const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -255,7 +262,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int sF, + SiteHalfSpinor *buf, int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -420,6 +427,15 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); +#define KERNEL_CALLG(A) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,Ls,sF,sU,in_v,out_v); \ + }); \ + accelerator_barrier(); + #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define KERNEL_CALL_EXT(A) \ @@ -450,7 +466,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSite); return;} #ifdef SYCL_HACK if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } #else @@ -460,13 +476,13 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { 
ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} @@ -485,19 +501,19 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} From 8a07b5200960cabc24a3ceefe9c20c331b1c2954 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 13 Oct 2022 18:44:47 -0400 Subject: [PATCH 161/240] Dirichlet --- 
.../GeneralEvenOddRationalRatio.h | 10 +++- .../pseudofermion/TwoFlavourEvenOddRatio.h | 5 ++ Grid/qcd/hmc/HMC.h | 2 +- Grid/qcd/hmc/integrators/Integrator.h | 42 +++++++++++++++ Grid/simd/Grid_vector_types.h | 53 ------------------- configure.ac | 2 +- systems/Crusher/config-command | 2 + 7 files changed, 59 insertions(+), 57 deletions(-) diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h index 027f37b3..ff605362 100644 --- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h @@ -59,7 +59,7 @@ NAMESPACE_BEGIN(Grid); typedef RationalActionParams Params; Params param; - + RealD RefreshAction; //For action evaluation MultiShiftFunction ApproxPowerAction ; //rational approx for X^{1/inv_pow} MultiShiftFunction ApproxNegPowerAction; //rational approx for X^{-1/inv_pow} @@ -270,12 +270,18 @@ NAMESPACE_BEGIN(Grid); assert(NumOp.ConstEE() == 1); assert(DenOp.ConstEE() == 1); PhiEven = Zero(); - std::cout< &_NumOp, FermionOperator &_DenOp, @@ -132,6 +134,9 @@ NAMESPACE_BEGIN(Grid); // Even det factors DenOp.MooeeDag(etaEven,tmp); NumOp.MooeeInvDag(tmp,PhiEven); + + RefreshAction = norm2(etaEven)+norm2(etaOdd); + std::cout << " refresh " < + void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { + + for (int a = 0; a < repr_set.size(); ++a) { + RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); + std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; + H += Hterm; + + } + } + } Sinitial_hireps{}; + + RealD Sinitial(Field& U) + { // here also U not used + + std::cout << GridLogIntegrator << "Integrator initial action\n"; + + RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom + + RealD Hterm; + + // Actions + for (int level = 0; level < as.size(); ++level) { + for (int actionID = 0; actionID < 
as[level].actions.size(); ++actionID) { + // get gauge field from the SmearingPolicy and + // based on the boolean is_smeared in actionID + Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); + std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; + as[level].actions.at(actionID)->S_timer_start(); + Hterm = as[level].actions.at(actionID)->Sinitial(Us); + as[level].actions.at(actionID)->S_timer_stop(); + std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; + H += Hterm; + } + as[level].apply(Sinitial_hireps, Representations, level, H); + } + + return H; + } + + void integrate(Field& U) { // reset the clocks diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 42460c48..daf41cae 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -148,23 +148,6 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #endif #endif -/* -#ifdef A64FXVLA -#pragma message("building A64FX VLA") -#if defined(ARMCLANGCOMPAT) - #pragma message("applying data types patch") -#endif -#include -#include "Grid_a64fx-2.h" -#endif - -#ifdef A64FXVLS -#pragma message("building A64FX VLS") -#include -#include "Grid_a64fx-fixedsize.h" -#endif -*/ - #ifdef SSE4 #include "Grid_sse4.h" #endif @@ -1147,42 +1130,6 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc #endif #endif -///////////////////////////////////////// -// Some traits to recognise the types -///////////////////////////////////////// -template -struct is_simd : public std::false_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> struct is_simd : public std::true_type {}; -template <> 
struct is_simd : public std::true_type {}; - -template using IfSimd = Invoke::value, int> >; -template using IfNotSimd = Invoke::value, unsigned> >; - -/////////////////////////////////////////////// -// Convenience insert / extract with complex support -/////////////////////////////////////////////// -template -accelerator_inline S getlane(const Grid_simd &in,int lane) { - return in.getlane(lane); -} -template -accelerator_inline void putlane(Grid_simd &vec,const S &_S, int lane){ - vec.putlane(_S,lane); -} -template = 0 > -accelerator_inline S getlane(const S &in,int lane) { - return in; -} -template = 0 > -accelerator_inline void putlane(S &vec,const S &_S, int lane){ - vec = _S; -} - NAMESPACE_END(Grid); diff --git a/configure.ac b/configure.ac index 9e7b8b80..2e6199c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_PREREQ([2.71]) +AC_PREREQ([2.69]) AC_INIT([Grid],[0.7.0],[https://github.com/paboyle/Grid],[Grid]) AC_CANONICAL_BUILD AC_CANONICAL_HOST diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 854c2c01..57b93e03 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -14,4 +14,6 @@ CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib LDFLAGS="-L/lib64 -L/opt/rocm-5.2.0/lib/ -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " +#--enable-simd=GPU-RRII \ + From 991667ba5e2a5dd972dd8f813e2a35be74b9adc9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 13 Oct 2022 18:50:35 -0400 Subject: [PATCH 162/240] Revert --- Grid/qcd/action/fermion/WilsonKernels.h | 12 ++-- .../WilsonKernelsImplementation.h | 56 +++++++------------ 2 files changed, 26 insertions(+), 42 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index c5d48095..68422f28 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -90,22 +90,22 @@ private: // 
Specialised variants static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static accelerator void GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, - int Ls, int sF, int sU, const FermionFieldView &in, FermionFieldView &out); + int sF, int sU, const FermionFieldView &in, FermionFieldView &out); static void AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 000b5445..bdba7cb2 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -61,13 +61,9 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ spProj(chi,tmp); \ } else { \ - int s = sF %sU ; \ - chi = Zero(); \ - if ( (s==0)||(s==Ls-1)) { \ - chi = coalescedRead(buf[SE->_offset],lane); \ - } \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ - acceleratorSynchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); @@ -84,14 +80,11 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ - if (!SE->_is_local ) { \ - int s = sF %sU ; \ - if ( (s==0)||(s==Ls-1)) { \ - auto chi = coalescedRead(buf[SE->_offset],lane); \ - Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ - Recon(result, Uchi); \ - nmu++; \ - } \ + if (!SE->_is_local ) { \ + auto chi = coalescedRead(buf[SE->_offset],lane); \ + Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ + Recon(result, Uchi); \ + nmu++; \ } \ acceleratorSynchronise(); @@ -118,7 +111,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, + SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -144,7 +137,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV template accelerator_inline void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, + SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef 
decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -173,7 +166,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, + SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -201,8 +194,8 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + SiteHalfSpinor *buf, int sF, + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -231,7 +224,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField //////////////////////////////////////////////////////////////////// template accelerator_inline void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, + SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -262,7 +255,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi template accelerator_inline void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, - SiteHalfSpinor *buf, int Ls, int sF, + SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; @@ -427,15 +420,6 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S 
WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); -#define KERNEL_CALLG(A) \ - const uint64_t NN = Nsite*Ls; \ - accelerator_forNB( ss, NN, Simd::Nsimd(), { \ - int sF = ss; \ - int sU = ss/Ls; \ - WilsonKernels::A(st_v,U_v,buf,Ls,sF,sU,in_v,out_v); \ - }); \ - accelerator_barrier(); - #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define KERNEL_CALL_EXT(A) \ @@ -466,7 +450,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSite); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifdef SYCL_HACK if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } #else @@ -476,13 +460,13 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} @@ -501,21 +485,21 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { - if (Opt 
== WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDag); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif acceleratorFenceComputeStream(); } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { acceleratorFenceComputeStream(); - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLG(GenericDhopSiteDagExt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} From 2e8c3b0ddb64b294bc061741164b1a399de6416c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 18 Oct 2022 18:10:01 -0400 Subject: [PATCH 163/240] Slow implementation of Shamir DWF --- Grid/qcd/action/fermion/DWFSlow.h | 291 ++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 Grid/qcd/action/fermion/DWFSlow.h diff --git a/Grid/qcd/action/fermion/DWFSlow.h b/Grid/qcd/action/fermion/DWFSlow.h new file mode 100644 index 00000000..61298504 --- /dev/null +++ b/Grid/qcd/action/fermion/DWFSlow.h @@ -0,0 +1,291 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: 
./lib/qcd/action/fermion/DWFSlow.h + +Copyright (C) 2022 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +class DWFSlowFermion : public FermionOperator +{ +public: + INHERIT_IMPL_TYPES(Impl); + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _grid4; } + GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; } + GridBase *FermionGrid(void) { return _grid; } + GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + + ////////////////////////////////////////////////////////////////// + // override multiply; cut number routines if pass dagger argument + // and also make interface more uniformly consistent + ////////////////////////////////////////////////////////////////// + virtual void M(const FermionField &in, FermionField &out) + { + FermionField tmp(_grid); + out = (5.0 - M5) * in; + Dhop(in,tmp,DaggerNo); + out = out + tmp; + } + virtual void 
Mdag(const FermionField &in, FermionField &out) + { + FermionField tmp(_grid); + out = (5.0 - M5) * in; + Dhop(in,tmp,DaggerYes); + out = out + tmp; + }; + + ///////////////////////////////////////////////////////// + // half checkerboard operations 5D redblack so just site identiy + ///////////////////////////////////////////////////////// + void Meooe(const FermionField &in, FermionField &out) + { + if ( in.Checkerboard() == Odd ) { + this->DhopEO(in,out,DaggerNo); + } else { + this->DhopOE(in,out,DaggerNo); + } + } + void MeooeDag(const FermionField &in, FermionField &out) + { + if ( in.Checkerboard() == Odd ) { + this->DhopEO(in,out,DaggerYes); + } else { + this->DhopOE(in,out,DaggerYes); + } + }; + + // allow override for twisted mass and clover + virtual void Mooee(const FermionField &in, FermionField &out) + { + out = (5.0 - M5) * in; + } + virtual void MooeeDag(const FermionField &in, FermionField &out) + { + out = (5.0 - M5) * in; + } + virtual void MooeeInv(const FermionField &in, FermionField &out) + { + out = (1.0/(5.0 - M5)) * in; + }; + virtual void MooeeInvDag(const FermionField &in, FermionField &out) + { + out = (1.0/(5.0 - M5)) * in; + }; + + virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector twist) {} ; + + //////////////////////// + // Derivative interface + //////////////////////// + // Interface calls an internal routine + void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);}; + void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);}; + void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);}; + + /////////////////////////////////////////////////////////////// + // non-hermitian hopping term; half cb or both + /////////////////////////////////////////////////////////////// + void Dhop(const FermionField &in, FermionField &out, int dag) + { + FermionField 
tmp(in.Grid()); + Dhop5(in,out,MassField,MassField,dag ); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag ); out = out + tmp; + } + }; + void DhopOE(const FermionField &in, FermionField &out, int dag) + { + FermionField tmp(in.Grid()); + assert(in.Checkerboard()==Even); + Dhop5(in,out,MassFieldOdd,MassFieldEven,dag); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp; + } + }; + void DhopEO(const FermionField &in, FermionField &out, int dag) + { + FermionField tmp(in.Grid()); + assert(in.Checkerboard()==Odd); + Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag ); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp; + } + }; + + /////////////////////////////////////////////////////////////// + // Multigrid assistance; force term uses too + /////////////////////////////////////////////////////////////// + void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);}; + void MdirAll(const FermionField &in, std::vector &out) { assert(0);}; + void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);}; + void DhopDirAll(const FermionField &in, std::vector &out) { assert(0);}; + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);}; + + void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag) + { + RealD sgn= 1.0; + if (dag ) sgn=-1.0; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + // mass is 1,1,1,1,-m has to multiply the round the world term + FermionField tmp (in.Grid()); + tmp = U5e * Cshift(in,mu+1,1); + out = tmp - Gamma(Gmu[mu])*tmp*sgn; + + tmp = Cshift(adj(U5o)*in,mu+1,-1); + out = out + tmp + Gamma(Gmu[mu])*tmp*sgn; + + out = -0.5*out; + }; + + void Dhop5(const FermionField &in, FermionField &out, 
ComplexField &massE, ComplexField &massO, int dag) + { + // Mass term.... must multiple the round world with mass = 1,1,1,1, -m + RealD sgn= 1.0; + if (dag ) sgn=-1.0; + + Gamma G5(Gamma::Algebra::Gamma5); + + FermionField tmp (in.Grid()); + tmp = massE*Cshift(in,0,1); + out = tmp - G5*tmp*sgn; + + tmp = Cshift(massO*in,0,-1); + out = out + tmp + G5*tmp*sgn; + out = -0.5*out; + }; + + // Constructor + DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5) + : + _grid(&Fgrid), + _cbgrid(&Hgrid), + _grid4(_Umu.Grid()), + Umu(Nd,&Fgrid), + UmuEven(Nd,&Hgrid), + UmuOdd(Nd,&Hgrid), + MassField(&Fgrid), + MassFieldEven(&Hgrid), + MassFieldOdd(&Hgrid), + M5(_M5), + mass(_mass), + _tmp(&Hgrid) + { + Ls=Fgrid._fdimensions[0]; + ImportGauge(_Umu); + + typedef typename FermionField::scalar_type scalar; + + Lattice > coor(&Fgrid); + LatticeCoordinate(coor, 0); // Scoor + ComplexField one(&Fgrid); + MassField =scalar(-mass); + one =scalar(1.0); + MassField =where(coor==Integer(Ls-1),MassField,one); + for(int mu=0;mu(_Umu4, mu); + for(int s=0;sLs;s++){ + InsertSlice(U4,Umu[mu],s,0); + } + } + } + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + +public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; + RealD mass; + RealD M5; + int Ls; + + GridBase *_grid4; + GridBase *_grid; + GridBase *_cbgrid4; + GridBase *_cbgrid; + + // Copy of the gauge field , with even and odd subsets + std::vector Umu; + std::vector UmuEven; + std::vector UmuOdd; + ComplexField MassField; + ComplexField MassFieldEven; + ComplexField MassFieldOdd; + + /////////////////////////////////////////////////////////////// + // Conserved current utilities + /////////////////////////////////////////////////////////////// + void ContractConservedCurrent(PropagatorField &q_in_1, + 
PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &phys_src, + Current curr_type, + unsigned int mu){} + void SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &phys_src, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx){} +}; + +typedef DWFSlowFermion DWFSlowFermionF; +typedef DWFSlowFermion DWFSlowFermionD; + +NAMESPACE_END(Grid); From 132d841b05217bf5e8efca1ef378c8505b700773 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 19:33:22 -0400 Subject: [PATCH 164/240] Compile fix --- Grid/tensors/Tensor_extract_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index c63a2439..87572faf 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -130,7 +130,7 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec typedef typename vobj::scalar_object scalar_object; typedef typename vobj::vector_type vector_type; typedef typename ExtractTypeMap::extract_type extract_type; - typedef extract_type * pointer; + typedef scalar_type * pointer; constexpr int words=sizeof(vobj)/sizeof(vector_type); constexpr int Nsimd=vector_type::Nsimd(); @@ -150,7 +150,7 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob typedef typename vobj::vector_type vector_type; typedef typename vector_type::scalar_type scalar_type; typedef typename ExtractTypeMap::extract_type extract_type; - typedef extract_type * pointer; + typedef scalar_type * pointer; constexpr int words=sizeof(vobj)/sizeof(vector_type); constexpr int Nsimd=vector_type::Nsimd(); From eae1c02111a03026dcfceb6ef8e8b6cf2a08b334 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 19:50:32 -0400 Subject: [PATCH 165/240] Bounds check --- HMC/Mobius2p1f_DD_RHMC_96I.cc | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/HMC/Mobius2p1f_DD_RHMC_96I.cc b/HMC/Mobius2p1f_DD_RHMC_96I.cc index 7ba5f5b5..967a3fc9 100644 --- a/HMC/Mobius2p1f_DD_RHMC_96I.cc +++ b/HMC/Mobius2p1f_DD_RHMC_96I.cc @@ -299,6 +299,7 @@ int main(int argc, char **argv) { // MDCG(SdagS,vec,res); + vec = 1; // Fill with any old junk std::cout << "Bounds check on light quark operator mass "<< Denominators[0]->Mass() < UdagU(*Denominators[0]); HighBoundCheck(UdagU,vec,OFRp.hi); @@ -308,14 +309,16 @@ int main(int argc, char **argv) { // MDCG(UdagU,vec,res); + vec = 1; // Fill with any old junk std::cout << "Bounds check on strange dirichlet operator mass "<< StrangeOpDir.Mass()< SddagSd(StrangeOpDir); - HighBoundCheck(SddagSd,vec,OFRp.hi); - ChebyBoundsCheck(SddagSd,vec,OFRp.lo,OFRp.hi); + HighBoundCheck(SddagSd,vec,SFRp.hi); + ChebyBoundsCheck(SddagSd,vec,SFRp.lo,SFRp.hi); std::cout << "strange dirichlet inversion"<Mass()< UddagUd(*Numerators[0]); HighBoundCheck(UddagUd,vec,OFRp.hi); @@ -337,7 +340,7 @@ int main(int argc, char **argv) { const int MaxIt= 10000; int Nconv; RealD resid = 1.0e-5; - if(0) + if(1) { int order = 501; RealD bound = 5.0e-4; From d9dd9a5b5f3f7eb797acc4e9ea8f1e20c2e4b843 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 19:51:50 -0400 Subject: [PATCH 166/240] LLVM update --- systems/Crusher/sourceme.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 3cccb10a..42e15fb1 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -2,7 +2,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sle module load emacs #module load gperftools module load PrgEnv-gnu -module load rocm/5.2.0 +module load rocm/5.3.0 module load cray-mpich/8.1.16 #module load cray-mpich/8.1.17 module load gmp From a3927a8a276ff499bd9d9a3b964e80a7eb16b646 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 20:22:27 -0400 Subject: 
[PATCH 167/240] Dirichlet --- tests/Test_cayley_even_odd_vec.cc | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc index 9524fea0..e25d6fde 100644 --- a/tests/Test_cayley_even_odd_vec.cc +++ b/tests/Test_cayley_even_odd_vec.cc @@ -29,19 +29,6 @@ Author: Peter Boyle using namespace std; using namespace Grid; - ; - -template -struct scal { - d internal; -}; - - Gamma::Algebra Gmu [] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT - }; template void TestWhat(What & Ddwf, @@ -88,7 +75,12 @@ int main (int argc, char ** argv) std::cout< boundary = {1,1,1,-1}; + DomainWallFermionD::ImplParams Params(boundary); + Coordinate Dirichlet({0,8,8,16,32}); + Params.dirichlet=Dirichlet; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params); TestWhat(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 From 78acae9b50c416fe26452cee818f3cda10752e0d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 20:24:17 -0400 Subject: [PATCH 168/240] Simple DWF for easy check --- Grid/qcd/action/fermion/DWFSlow.h | 291 ++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 Grid/qcd/action/fermion/DWFSlow.h diff --git a/Grid/qcd/action/fermion/DWFSlow.h b/Grid/qcd/action/fermion/DWFSlow.h new file mode 100644 index 00000000..61298504 --- /dev/null +++ b/Grid/qcd/action/fermion/DWFSlow.h @@ -0,0 +1,291 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/DWFSlow.h + +Copyright (C) 2022 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the 
License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +class DWFSlowFermion : public FermionOperator +{ +public: + INHERIT_IMPL_TYPES(Impl); + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _grid4; } + GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; } + GridBase *FermionGrid(void) { return _grid; } + GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + + ////////////////////////////////////////////////////////////////// + // override multiply; cut number routines if pass dagger argument + // and also make interface more uniformly consistent + ////////////////////////////////////////////////////////////////// + virtual void M(const FermionField &in, FermionField &out) + { + FermionField tmp(_grid); + out = (5.0 - M5) * in; + Dhop(in,tmp,DaggerNo); + out = out + tmp; + } + virtual void Mdag(const FermionField &in, FermionField &out) + { + FermionField tmp(_grid); + out = (5.0 - M5) * in; + Dhop(in,tmp,DaggerYes); + out = out + tmp; + }; + + ///////////////////////////////////////////////////////// + // half checkerboard operations 5D redblack so just site identiy + 
///////////////////////////////////////////////////////// + void Meooe(const FermionField &in, FermionField &out) + { + if ( in.Checkerboard() == Odd ) { + this->DhopEO(in,out,DaggerNo); + } else { + this->DhopOE(in,out,DaggerNo); + } + } + void MeooeDag(const FermionField &in, FermionField &out) + { + if ( in.Checkerboard() == Odd ) { + this->DhopEO(in,out,DaggerYes); + } else { + this->DhopOE(in,out,DaggerYes); + } + }; + + // allow override for twisted mass and clover + virtual void Mooee(const FermionField &in, FermionField &out) + { + out = (5.0 - M5) * in; + } + virtual void MooeeDag(const FermionField &in, FermionField &out) + { + out = (5.0 - M5) * in; + } + virtual void MooeeInv(const FermionField &in, FermionField &out) + { + out = (1.0/(5.0 - M5)) * in; + }; + virtual void MooeeInvDag(const FermionField &in, FermionField &out) + { + out = (1.0/(5.0 - M5)) * in; + }; + + virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector twist) {} ; + + //////////////////////// + // Derivative interface + //////////////////////// + // Interface calls an internal routine + void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);}; + void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);}; + void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);}; + + /////////////////////////////////////////////////////////////// + // non-hermitian hopping term; half cb or both + /////////////////////////////////////////////////////////////// + void Dhop(const FermionField &in, FermionField &out, int dag) + { + FermionField tmp(in.Grid()); + Dhop5(in,out,MassField,MassField,dag ); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag ); out = out + tmp; + } + }; + void DhopOE(const FermionField &in, FermionField &out, int dag) + { + FermionField tmp(in.Grid()); + assert(in.Checkerboard()==Even); + 
Dhop5(in,out,MassFieldOdd,MassFieldEven,dag); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp; + } + }; + void DhopEO(const FermionField &in, FermionField &out, int dag) + { + FermionField tmp(in.Grid()); + assert(in.Checkerboard()==Odd); + Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag ); + for(int mu=0;mu<4;mu++){ + DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp; + } + }; + + /////////////////////////////////////////////////////////////// + // Multigrid assistance; force term uses too + /////////////////////////////////////////////////////////////// + void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);}; + void MdirAll(const FermionField &in, std::vector &out) { assert(0);}; + void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);}; + void DhopDirAll(const FermionField &in, std::vector &out) { assert(0);}; + void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);}; + + void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag) + { + RealD sgn= 1.0; + if (dag ) sgn=-1.0; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + // mass is 1,1,1,1,-m has to multiply the round the world term + FermionField tmp (in.Grid()); + tmp = U5e * Cshift(in,mu+1,1); + out = tmp - Gamma(Gmu[mu])*tmp*sgn; + + tmp = Cshift(adj(U5o)*in,mu+1,-1); + out = out + tmp + Gamma(Gmu[mu])*tmp*sgn; + + out = -0.5*out; + }; + + void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag) + { + // Mass term.... 
must multiple the round world with mass = 1,1,1,1, -m + RealD sgn= 1.0; + if (dag ) sgn=-1.0; + + Gamma G5(Gamma::Algebra::Gamma5); + + FermionField tmp (in.Grid()); + tmp = massE*Cshift(in,0,1); + out = tmp - G5*tmp*sgn; + + tmp = Cshift(massO*in,0,-1); + out = out + tmp + G5*tmp*sgn; + out = -0.5*out; + }; + + // Constructor + DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5) + : + _grid(&Fgrid), + _cbgrid(&Hgrid), + _grid4(_Umu.Grid()), + Umu(Nd,&Fgrid), + UmuEven(Nd,&Hgrid), + UmuOdd(Nd,&Hgrid), + MassField(&Fgrid), + MassFieldEven(&Hgrid), + MassFieldOdd(&Hgrid), + M5(_M5), + mass(_mass), + _tmp(&Hgrid) + { + Ls=Fgrid._fdimensions[0]; + ImportGauge(_Umu); + + typedef typename FermionField::scalar_type scalar; + + Lattice > coor(&Fgrid); + LatticeCoordinate(coor, 0); // Scoor + ComplexField one(&Fgrid); + MassField =scalar(-mass); + one =scalar(1.0); + MassField =where(coor==Integer(Ls-1),MassField,one); + for(int mu=0;mu(_Umu4, mu); + for(int s=0;sLs;s++){ + InsertSlice(U4,Umu[mu],s,0); + } + } + } + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + +public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; + RealD mass; + RealD M5; + int Ls; + + GridBase *_grid4; + GridBase *_grid; + GridBase *_cbgrid4; + GridBase *_cbgrid; + + // Copy of the gauge field , with even and odd subsets + std::vector Umu; + std::vector UmuEven; + std::vector UmuOdd; + ComplexField MassField; + ComplexField MassFieldEven; + ComplexField MassFieldOdd; + + /////////////////////////////////////////////////////////////// + // Conserved current utilities + /////////////////////////////////////////////////////////////// + void ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField 
&phys_src, + Current curr_type, + unsigned int mu){} + void SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &phys_src, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx){} +}; + +typedef DWFSlowFermion DWFSlowFermionF; +typedef DWFSlowFermion DWFSlowFermionD; + +NAMESPACE_END(Grid); From 006268f55624709aea0d353cbaf0d5875315b4b5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 2 Nov 2022 20:24:51 -0400 Subject: [PATCH 169/240] DWF Slow version --- Grid/qcd/action/fermion/Fermion.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 2f3fb9f2..bad736cf 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -47,6 +47,7 @@ Author: Peter Boyle //////////////////////////////////////////// // Fermion operators / actions //////////////////////////////////////////// +#include // Slow DWF #include // 4d wilson like NAMESPACE_CHECK(Wilson); From a11c12e2e716cafb54b40fe45405e56144edea14 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:20:01 -0500 Subject: [PATCH 170/240] Modifications for partial dirichlet BCs --- Grid/communicator/Communicator_base.h | 2 +- Grid/communicator/Communicator_mpi3.cc | 18 +++++++----------- Grid/communicator/Communicator_none.cc | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 68cd36cc..b98424a1 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -155,7 +155,7 @@ public: int xmit_to_rank,int do_xmit, void *recv, int recv_from_rank,int do_recv, - int bytes,int dir); + int xbytes,int rbytes,int dir); void StencilSendToRecvFromComplete(std::vector &waitall,int i); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 
cbdd224d..892e3dbe 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -343,7 +343,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } @@ -353,7 +353,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); } } - // if ( CommunicatorPolicy == CommunicatorPolicySequential ) { - // this->StencilSendToRecvFromComplete(list,dir); - // } - return off_node_bytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 4b533c4b..5d917b79 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -126,7 +126,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Tue, 15 Nov 2022 16:21:24 -0500 Subject: [PATCH 171/240] Partial Dirichlet --- Grid/qcd/action/ActionParams.h | 11 +- Grid/qcd/action/fermion/WilsonCompressor.h | 330 +++++++++++------- .../WilsonFermion5DImplementation.h | 13 +- 3 files changed, 223 insertions(+), 131 deletions(-) diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index 274ff318..b2a06280 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -39,29 +39,38 @@ struct GparityWilsonImplParams { Coordinate twists; //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs Coordinate dirichlet; // Blocksize of dirichlet BCs - 
GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); }; + int partialDirichlet; + GparityWilsonImplParams() : twists(Nd, 0) { + dirichlet.resize(0); + partialDirichlet=0; + }; }; struct WilsonImplParams { bool overlapCommsCompute; Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; AcceleratorVector twist_n_2pi_L; AcceleratorVector boundary_phases; WilsonImplParams() { dirichlet.resize(0); + partialDirichlet=0; boundary_phases.resize(Nd, 1.0); twist_n_2pi_L.resize(Nd, 0.0); }; WilsonImplParams(const AcceleratorVector phi) : boundary_phases(phi), overlapCommsCompute(false) { twist_n_2pi_L.resize(Nd, 0.0); + partialDirichlet=0; dirichlet.resize(0); } }; struct StaggeredImplParams { Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; StaggeredImplParams() { + partialDirichlet=0; dirichlet.resize(0); }; }; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index de2f1979..e2ced552 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -32,17 +32,182 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////////////// +// Wilson compressor will need FaceGather policies for: +// Periodic, Dirichlet, and partial Dirichlet for DWF +/////////////////////////////////////////////////////////////// +class FaceGatherPartialDWF +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) { return 1;} + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + //DWF only hack: If a direction that is OFF node we use Partial Dirichlet + // Shrinks local and remote comms buffers + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; + std::pair *table_v = & table[0]; + auto 
rhs_v = rhs.View(AcceleratorRead); + int vol=table.size()/Ls; + accelerator_forNB( idx,table.size(), vobj::Nsimd(), { + Integer i=idx/Ls; + Integer s=idx%Ls; + if(s==0) compress.Compress(buffer[off+i ],rhs_v[so+table_v[idx].second]); + if(s==Ls-1) compress.Compress(buffer[off+i+vol],rhs_v[so+table_v[idx].second]); + }); + } + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + auto Ls = dd.dims[0]; + // Just pass in the Grid + auto kp = dd.kernel_p; + auto mp = dd.mpi_p; + int size= dd.buffer_size; + int vol= size/Ls; + accelerator_forNB(o,size,1,{ + int idx=o/Ls; + int s=o%Ls; + if ( s == 0 ) { + int oo=idx; + kp[o]=mp[oo]; + } else if ( s == Ls-1 ) { + int oo=vol+idx; + kp[o]=mp[oo]; + } else { + kp[o] = Zero();//fill rest with zero if partial dirichlet + } + }); + } + //////////////////////////////////////////////////////////////////////////////////////////// + // Need to gather *interior portions* for ALL s-slices in simd directions + // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side + // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. + //////////////////////////////////////////////////////////////////////////////////////////// + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; + + // insertion of zeroes... 
+ assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + auto rhs_v = rhs.View(AcceleratorRead); + auto p0=&pointers[0][0]; + auto p1=&pointers[1][0]; + auto tp=&table[0]; + int nnum=num/Ls; + accelerator_forNB(j, num, vobj::Nsimd(), { + // Reorders both local and remote comms buffers + // + int s = j % Ls; + int sp1 = (s+1)%Ls; // peri incremented s slice + + int hxyz= j/Ls; + + int xyz0= hxyz*2; // xyzt part of coor + int xyz1= hxyz*2+1; + + int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... + + int kk0= xyz0*Ls + s ; // s=0 goes to s=1 + int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0 + compress.CompressExchange(p0[jj],p1[jj], + rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites + rhs_v[so+tp[kk1 ].second], + type); + }); + rhs_v.ViewClose(); + } + // Merge routine is for SIMD faces + template + static void MergeFace(decompressor decompress,Merger &mm) + { + auto Ls = mm.dims[0]; + int num= mm.buffer_size/2; // relate vol and Ls to buffer size + auto mp = &mm.mpointer[0]; + auto vp0= &mm.vpointers[0][0]; // First arg is exchange first + auto vp1= &mm.vpointers[1][0]; + auto type= mm.type; + int nnum = num/Ls; + accelerator_forNB(o,num,vobj::Nsimd(),{ + + int s=o%Ls; + int hxyz=o/Ls; // xyzt related component + int xyz0=hxyz*2; + int xyz1=hxyz*2+1; + + int sp = (s+1)%Ls; + int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... 
+ + int oo0= s+xyz0*Ls; + int oo1= s+xyz1*Ls; + + // same ss0, ss1 pair goes to new layout + decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type); + }); + } +}; +class FaceGatherDWFMixedBCs +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); + else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); + } + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial); + else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial); + } + template + static void MergeFace(decompressor decompress,Merger &mm) + { + int partial = mm.partial; + if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm); + else FaceGatherSimple::MergeFace(decompress,mm); + } + + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + int partial = dd.partial; + if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd); + else FaceGatherSimple::DecompressFace(decompress,dd); + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////// -// optimised versions supporting half precision too +// optimised versions supporting half precision too??? 
Deprecate ///////////////////////////////////////////////////////////////////////////////////////////// -template -class WilsonCompressorTemplate; - +//Could make FaceGather a template param, but then behaviour is runtime not compile time template -class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, - typename std::enable_if::value>::type > +class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs +// : public FaceGatherSimple { public: @@ -79,172 +244,81 @@ public: /*****************************************************/ /* Exchange includes precision change if mpi data is not same */ /*****************************************************/ - accelerator_inline void Exchange(SiteHalfSpinor *mp, - const SiteHalfSpinor * __restrict__ vp0, - const SiteHalfSpinor * __restrict__ vp1, - Integer type,Integer o) const { + accelerator_inline void Exchange(SiteHalfSpinor &mp0, + SiteHalfSpinor &mp1, + const SiteHalfSpinor & vp0, + const SiteHalfSpinor & vp1, + Integer type) const { #ifdef GRID_SIMT - exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchangeSIMT(mp0,mp1,vp0,vp1,type); #else SiteHalfSpinor tmp1; SiteHalfSpinor tmp2; - exchange(tmp1,tmp2,vp0[o],vp1[o],type); - vstream(mp[2*o ],tmp1); - vstream(mp[2*o+1],tmp2); + exchange(tmp1,tmp2,vp0,vp1,type); + vstream(mp0,tmp1); + vstream(mp1,tmp2); #endif } - + /*****************************************************/ /* Have a decompression step if mpi data is not same */ /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, - SiteHalfSpinor * __restrict__ in, Integer o) const { - assert(0); + accelerator_inline void Decompress(SiteHalfSpinor &out, + SiteHalfSpinor &in) const { + out = in; } /*****************************************************/ /* Compress Exchange */ /*****************************************************/ - accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, - SiteHalfSpinor * 
__restrict__ out1, - const SiteSpinor * __restrict__ in, - Integer j,Integer k, Integer m,Integer type) const + accelerator_inline void CompressExchange(SiteHalfSpinor &out0, + SiteHalfSpinor &out1, + const SiteSpinor &in0, + const SiteSpinor &in1, + Integer type) const { #ifdef GRID_SIMT typedef SiteSpinor vobj; typedef SiteHalfSpinor hvobj; - typedef decltype(coalescedRead(*in)) sobj; - typedef decltype(coalescedRead(*out0)) hsobj; + typedef decltype(coalescedRead(in0)) sobj; + typedef decltype(coalescedRead(out0)) hsobj; unsigned int Nsimd = vobj::Nsimd(); unsigned int mask = Nsimd >> (type + 1); int lane = acceleratorSIMTlane(Nsimd); int j0 = lane &(~mask); // inner coor zero int j1 = lane |(mask) ; // inner coor one - const vobj *vp0 = &in[k]; - const vobj *vp1 = &in[m]; + const vobj *vp0 = &in0; + const vobj *vp1 = &in1; const vobj *vp = (lane&mask) ? vp1:vp0; auto sa = coalescedRead(*vp,j0); auto sb = coalescedRead(*vp,j1); hsobj psa, psb; projector::Proj(psa,sa,mu,dag); projector::Proj(psb,sb,mu,dag); - coalescedWrite(out0[j],psa); - coalescedWrite(out1[j],psb); + coalescedWrite(out0,psa); + coalescedWrite(out1,psb); #else SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp3, temp4; - projector::Proj(temp1,in[k],mu,dag); - projector::Proj(temp2,in[m],mu,dag); + projector::Proj(temp1,in0,mu,dag); + projector::Proj(temp2,in1,mu,dag); exchange(temp3,temp4,temp1,temp2,type); - vstream(out0[j],temp3); - vstream(out1[j],temp4); + vstream(out0,temp3); + vstream(out1,temp4); #endif } /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) const { return false; } + accelerator_inline bool DecompressionStep(void) const { + return false; + } }; -#if 0 -template -class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, - typename std::enable_if::value>::type > -{ -public: - - int mu,dag; - - void Point(int p) { mu=p; 
}; - - WilsonCompressorTemplate(int _dag=0){ - dag = _dag; - } - - typedef _Spinor SiteSpinor; - typedef _Hspinor SiteHalfSpinor; - typedef _HCspinor SiteHalfCommSpinor; - typedef typename SiteHalfCommSpinor::vector_type vComplexLow; - typedef typename SiteHalfSpinor::vector_type vComplexHigh; - constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - - accelerator_inline int CommDatumSize(void) const { - return sizeof(SiteHalfCommSpinor); - } - - /*****************************************************/ - /* Compress includes precision change if mpi data is not same */ - /*****************************************************/ - accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { - SiteHalfSpinor hsp; - SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; - projector::Proj(hsp,in,mu,dag); - precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); - } - accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { -#ifdef GRID_SIMT - typedef decltype(coalescedRead(buf)) sobj; - sobj sp; - auto sin = coalescedRead(in); - projector::Proj(sp,sin,mu,dag); - coalescedWrite(buf,sp); -#else - projector::Proj(buf,in,mu,dag); -#endif - } - - /*****************************************************/ - /* Exchange includes precision change if mpi data is not same */ - /*****************************************************/ - accelerator_inline void Exchange(SiteHalfSpinor *mp, - SiteHalfSpinor *vp0, - SiteHalfSpinor *vp1, - Integer type,Integer o) const { - SiteHalfSpinor vt0,vt1; - SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; - SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; - precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw); - precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw); - exchange(mp[2*o],mp[2*o+1],vt0,vt1,type); - } - - /*****************************************************/ - /* Have a decompression step if mpi data is not same */ - 
/*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { - SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; - precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); - } - - /*****************************************************/ - /* Compress Exchange */ - /*****************************************************/ - accelerator_inline void CompressExchange(SiteHalfSpinor *out0, - SiteHalfSpinor *out1, - const SiteSpinor *in, - Integer j,Integer k, Integer m,Integer type) const { - SiteHalfSpinor temp1, temp2,temp3,temp4; - SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; - SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; - projector::Proj(temp1,in[k],mu,dag); - projector::Proj(temp2,in[m],mu,dag); - exchange(temp3,temp4,temp1,temp2,type); - precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw); - precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw); - } - - /*****************************************************/ - /* Pass the info to the stencil */ - /*****************************************************/ - accelerator_inline bool DecompressionStep(void) const { return true; } - -}; -#endif - #define DECLARE_PROJ(Projector,Compressor,spProj) \ class Projector { \ public: \ diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 388094b2..992f4d20 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -148,12 +148,21 @@ void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) GaugeField HUmu(_Umu.Grid()); HUmu = _Umu*(-0.5); if ( Dirichlet ) { - std::cout << GridLogDslash << " Dirichlet BCs 5d " <LocalDimensions()[d]; + if (GaugeBlock) assert( (GaugeBlock%ldim)==0); + } + } + if ( Dirichlet && 
(!this->Params.partialDirichlet) ) { + std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " < Filter(GaugeBlock); Filter.applyFilter(HUmu); + } else { + std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " not filtered gauge field" < Date: Tue, 15 Nov 2022 16:21:45 -0500 Subject: [PATCH 172/240] Generic patch --- Grid/simd/Grid_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/simd/Grid_generic.h b/Grid/simd/Grid_generic.h index 35cb3bc3..4e527e33 100644 --- a/Grid/simd/Grid_generic.h +++ b/Grid/simd/Grid_generic.h @@ -244,7 +244,7 @@ struct Conj{ struct TimesMinusI{ // Complex template - accelerator_inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a){ vec out; VECTOR_FOR(i, W::c, 1) @@ -265,7 +265,7 @@ struct TimesMinusI{ struct TimesI{ // Complex template - accelerator_inline vec operator()(vec a, vec b){ + accelerator_inline vec operator()(vec a){ vec out; VECTOR_FOR(i, W::c, 1) From 0db4f1803f520835f5e86eae71bc8d564191b266 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:23:41 -0500 Subject: [PATCH 173/240] Partial dirichlet support --- Grid/stencil/SimpleCompressor.h | 152 +++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 32 deletions(-) diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index b36d954f..eda6d9e7 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -3,26 +3,108 @@ NAMESPACE_BEGIN(Grid); -template -accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type) -{ - typedef decltype(coalescedRead(mp0)) sobj; - unsigned int Nsimd = vobj::Nsimd(); - unsigned int mask = Nsimd >> (type + 1); - int lane = acceleratorSIMTlane(Nsimd); - int j0 = lane &(~mask); // inner coor zero - int j1 = lane |(mask) ; // inner coor one - const vobj *vpa = &vp0; - const vobj *vpb = &vp1; - const vobj *vp = (lane&mask) ? 
(vpb) : (vpa); - auto sa = coalescedRead(vp[0],j0); - auto sb = coalescedRead(vp[0],j1); - coalescedWrite(mp0,sa); - coalescedWrite(mp1,sb); -} +class SimpleStencilParams{ +public: + Coordinate dirichlet; + int partialDirichlet; + SimpleStencilParams() { partialDirichlet = 0; }; +}; -template -class SimpleCompressor { + +// Compressors will inherit buffer management policies +// Standard comms buffer management +class FaceGatherSimple +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return 1;}; + // Decompress is after merge so ok + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + int num=table.size(); + std::pair *table_v = & table[0]; + + auto rhs_v = rhs.View(AcceleratorRead); + accelerator_forNB( i,num, vobj::Nsimd(), { + compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); + }); + rhs_v.ViewClose(); + } + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + auto rhs_v = rhs.View(AcceleratorRead); + auto p0=&pointers[0][0]; + auto p1=&pointers[1][0]; + auto tp=&table[0]; + auto rhs_p = &rhs_v[0]; + accelerator_forNB(j, num, vobj::Nsimd(), { + compress.CompressExchange(p0[j],p1[j], + rhs_p[so+tp[2*j ].second], + rhs_p[so+tp[2*j+1].second], + type); + }); + rhs_v.ViewClose(); + } + + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + auto kp = dd.kernel_p; + auto mp = dd.mpi_p; + accelerator_forNB(o,dd.buffer_size,1,{ + decompress.Decompress(kp[o],mp[o]); + }); + } + template + static void MergeFace(decompressor decompress,Merger &mm) + { + auto mp = &mm.mpointer[0]; + auto vp0= 
&mm.vpointers[0][0]; + auto vp1= &mm.vpointers[1][0]; + auto type= mm.type; + accelerator_forNB(o,mm.buffer_size/2,vobj::Nsimd(),{ + decompress.Exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + }); + } +}; + +//////////////////////////////////// +// Wilson compressor will add alternate policies for Dirichlet +// and possibly partial Dirichlet for DWF +//////////////////////////////////// +/* +class FaceGatherDirichlet +{ + // If it's dirichlet we don't assemble comms buffers + // + // Rely on zeroes in gauge field to drive the correct result + // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute + template + static void Gather_plane_simple (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so){}; + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type) {} + template + static void Merge(decompressor decompress,Merge &mm) { } + template + static void Decompress(decompressor decompress,Decompression &dd) {} +}; +*/ + +template +class SimpleCompressorGather : public FaceGather { public: void Point(int) {}; accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } @@ -30,20 +112,19 @@ public: accelerator_inline void Compress(vobj &buf,const vobj &in) const { coalescedWrite(buf,coalescedRead(in)); } - accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { + accelerator_inline void Exchange(vobj &mp0,vobj &mp1,vobj &vp0,vobj &vp1,Integer type) const { #ifdef GRID_SIMT - exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchangeSIMT(mp0,mp1,vp0,vp1,type); #else - exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchange(mp0,mp1,vp0,vp1,type); #endif } - accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } - accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int 
j,int k, int m,int type) const { + accelerator_inline void Decompress(vobj &out,vobj &in) const { }; + accelerator_inline void CompressExchange(vobj &out0,vobj &out1,const vobj &in0,const vobj &in1,int type) const { #ifdef GRID_SIMT - exchangeSIMT(out0[j],out1[j],in[k],in[m],type); + exchangeSIMT(out0,out1,in0,in1,type); #else - exchange(out0[j],out1[j],in[k],in[m],type); + exchange(out0,out1,in0,in1,type); #endif } // For cshift. Cshift should drop compressor coupling altogether @@ -52,11 +133,18 @@ public: return arg; } }; -class SimpleStencilParams{ -public: - Coordinate dirichlet; - SimpleStencilParams() {}; -}; + +// Standard compressor never needs dirichlet. +// +// Get away with a local period wrap and rely on dirac operator to use a zero gauge link as it is faster +// +// Compressors that inherit Dirichlet and Non-dirichlet behaviour. +// +// Currently run-time behaviour through StencilParameters paramaters, p.dirichlet +// combined with the FaceGatherSimple behaviour + +template using SimpleCompressor = SimpleCompressorGather; +//template using SimpleCompressorDirichlet = SimpleCompressorGather; NAMESPACE_END(Grid); From e2e269e03b785114d8989b292e6052d1b236d3cc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:24:26 -0500 Subject: [PATCH 174/240] Partial dirichlet BCs --- Grid/stencil/Stencil.h | 204 ++++++++++++++++++++++++++--------------- 1 file changed, 131 insertions(+), 73 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index e23ff258..dd8b646e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -59,6 +59,7 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); +/* template void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); @@ -103,6 +104,7 @@ void Gather_plane_exchange_table(commVector >& table,const La }); 
rhs_v.ViewClose(); } +*/ struct StencilEntry { #ifdef GRID_CUDA @@ -133,8 +135,18 @@ class CartesianStencilAccelerator { int _osites; StencilVector _directions; StencilVector _distances; - StencilVector _comms_send; - StencilVector _comms_recv; + /////////////////////////////////////////////////// + // If true, this is FULLY communicated per face + // Otherwise will either be full or partial dirichlet + /////////////////////////////////////////////////// + StencilVector _comms_send; + StencilVector _comms_recv; // this is FULLY communicated per face + /////////////////////////////////////////////////// + // If true, this is partially communicated per face + /////////////////////////////////////////////////// + StencilVector _comms_partial_send; + StencilVector _comms_partial_recv; + // StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; @@ -229,7 +241,8 @@ public: Integer from_rank; Integer do_send; Integer do_recv; - Integer bytes; + Integer xbytes; + Integer rbytes; }; struct Merge { cobj * mpointer; @@ -237,11 +250,15 @@ public: Vector vpointers; Integer buffer_size; Integer type; + Integer partial; // partial dirichlet BCs + Coordinate dims; }; struct Decompress { cobj * kernel_p; cobj * mpi_p; Integer buffer_size; + Integer partial; // partial dirichlet BCs + Coordinate dims; }; struct CopyReceiveBuffer { void * from_p; @@ -252,7 +269,8 @@ public: Integer direction; Integer OrthogPlane; Integer DestProc; - Integer bytes; + Integer xbytes; + Integer rbytes; Integer lane; Integer cb; void *recv_buf; @@ -277,6 +295,7 @@ public: } int face_table_computed; + int partialDirichlet; std::vector > > face_table ; Vector surface_list; @@ -365,7 +384,7 @@ public: Packets[i].to_rank,Packets[i].do_send, Packets[i].recv_buf, Packets[i].from_rank,Packets[i].do_recv, - Packets[i].bytes,i); + Packets[i].xbytes,Packets[i].rbytes,i); } } @@ -501,7 +520,9 @@ public: } } - Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer 
DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb) + Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane, + Integer xbytes,Integer rbytes, + Integer cb) { CachedTransfer obj; obj.direction = direction; @@ -509,19 +530,22 @@ public: obj.DestProc = DestProc; obj.recv_buf = recv_buf; obj.lane = lane; - obj.bytes = bytes; + obj.xbytes = xbytes; + obj.rbytes = rbytes; obj.cb = cb; for(int i=0;i &dv) { Decompress d; + d.partial = this->partialDirichlet; + d.dims = _grid->_fdimensions; d.kernel_p = k_p; d.mpi_p = m_p; d.buffer_size = buffer_size; @@ -552,6 +581,8 @@ public: } void AddMerge(cobj *merge_p,Vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; + m.partial = this->partialDirichlet; + m.dims = _grid->_fdimensions; m.type = type; m.mpointer = merge_p; m.vpointers= rpointers; @@ -571,21 +602,10 @@ public: void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { for(int i=0;i_comms_send[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; } + if ( partialDirichlet ) { + this->_comms_partial_send[ii] = !this->_comms_send[ii]; + this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; + } } } } @@ -691,7 +715,7 @@ public: this->same_node.resize(npoints); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); - + partialDirichlet = p.partialDirichlet; DirichletBlock(p.dirichlet); // comms send/recv set up _unified_buffer_size=0; @@ -827,7 +851,7 @@ public: GridBase *grid=_grid; const int Nsimd = grid->Nsimd(); - int comms_recv = this->_comms_recv[point]; + int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1014,8 +1038,10 @@ public: { typedef typename cobj::vector_type vector_type; - int comms_send = this->_comms_send[point] ; - int comms_recv = this->_comms_recv[point] ; + 
int comms_send = this->_comms_send[point]; + int comms_recv = this->_comms_recv[point]; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1046,7 +1072,17 @@ public: if (cbmask != 0x3) words=words>>1; int bytes = words * compress.CommDatumSize(); + int xbytes; + int rbytes; + if ( comms_send ) xbytes = bytes; // Full send + else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); + else xbytes = 0; // full dirichlet + + if ( comms_recv ) rbytes = bytes; + else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); + else rbytes = 0; + int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int comm_off = u_comm_offset; @@ -1059,49 +1095,47 @@ public: assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - if( comms_send ) { - - if ( !face_table_computed ) { - face_table.resize(face_idx+1); - std::vector > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + if ( !face_table_computed ) { + face_table.resize(face_idx+1); + std::vector > face_table_host ; + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + // std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<u_recv_buf_p; - } - - send_buf = this->u_send_buf_p; // Gather locally, must send - - //////////////////////////////////////////////////////// - // Gather locally - //////////////////////////////////////////////////////// - assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so); } - int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask); - if ( (!duplicate) ) { // Force comms for now + if ( 
(compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=this->u_recv_buf_p; + } + + //////////////////////////////////////////////////////// + // Gather locally + //////////////////////////////////////////////////////// + send_buf = this->u_send_buf_p; // Gather locally, must send + assert(send_buf!=NULL); + + compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); + + int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); + if ( !duplicate ) { // Force comms for now + /////////////////////////////////////////////////////////// // Build a list of things to do after we synchronise GPUs // Start comms now??? /////////////////////////////////////////////////////////// AddPacket((void *)&send_buf[comm_off], (void *)&recv_buf[comm_off], - xmit_to_rank, comms_send, - recv_from_rank, comms_recv, - bytes); + xmit_to_rank, comms_send|comms_partial_send, + recv_from_rank, comms_recv|comms_partial_recv, + xbytes,rbytes); } - - if ( compress.DecompressionStep() && comms_recv ) { + + if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) { AddDecompress(&this->u_recv_buf_p[comm_off], &recv_buf[comm_off], words,Decompressions); @@ -1109,7 +1143,6 @@ public: u_comm_offset+=words; face_idx++; - } } return 0; @@ -1122,8 +1155,10 @@ public: const int maxl =2;// max layout in a direction - int comms_send = this->_comms_send[point] ; - int comms_recv = this->_comms_recv[point] ; + int comms_send = this->_comms_send[point]; + int comms_recv = this->_comms_recv[point]; + int comms_partial_send = this->_comms_partial_send[point] ; + int comms_partial_recv = this->_comms_partial_recv[point] ; int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -1153,6 +1188,11 @@ public: int datum_bytes = compress.CommDatumSize(); int bytes = 
(reduced_buffer_size*datum_bytes)/simd_layout; + + // how many bytes on wire : partial dirichlet or dirichlet may set to < bytes + int xbytes; + int rbytes; + assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); Vector rpointers(maxl); @@ -1182,22 +1222,37 @@ public: if ( !face_table_computed ) { face_table.resize(face_idx+1); std::vector > face_table_host ; - + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); face_table[face_idx].resize(face_table_host.size()); acceleratorCopyToDevice(&face_table_host[0], &face_table[face_idx][0], face_table[face_idx].size()*sizeof(face_table_host[0])); + } - if ( comms_send || comms_recv ) { - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + + if ( comms_send ) xbytes = bytes; + else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); + else xbytes = 0; + + if ( comms_recv ) rbytes = bytes; + else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); + else rbytes = 0; + + // Gathers SIMD lanes for send and merge + // Different faces can be full comms or partial comms with multiple ranks per node + if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { + + int partial = partialDirichlet; + compressor::Gather_plane_exchange(face_table[face_idx],rhs, + spointers,dimension,sx,cbmask, + compress,permute_type,partial ); } face_idx++; - //spointers[0] -- low - //spointers[1] -- high - + //spointers[0] -- low simd coor + //spointers[1] -- high simd coor for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); rpointers[i] = rp; - - int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask); + + int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask); if ( !duplicate ) { + if ( (bytes != rbytes) && (rbytes!=0) ){ + acceleratorMemSet(rp,0,bytes); // Zero prefill comms 
buffer to zero + } AddPacket((void *)sp,(void *)rp, - xmit_to_rank,comms_send, - recv_from_rank,comms_recv, - bytes); + xmit_to_rank,comms_send|comms_partial_send, + recv_from_rank,comms_recv|comms_partial_recv, + xbytes,rbytes); } } else { @@ -1238,7 +1296,7 @@ public: } } - if ( comms_recv ) { + if ( comms_recv|comms_partial_recv ) { AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); } From 7d302a525def70d276a2c6e87c50a15a57ee001a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:24:55 -0500 Subject: [PATCH 175/240] Natural place for this routine is here --- Grid/tensors/Tensor_SIMT.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 0a7d3382..8015d74c 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -31,6 +31,27 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); +//////////////////////////////////////////////// +// Inside a GPU thread +//////////////////////////////////////////////// +template +accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type) +{ + typedef decltype(coalescedRead(mp0)) sobj; + unsigned int Nsimd = vobj::Nsimd(); + unsigned int mask = Nsimd >> (type + 1); + int lane = acceleratorSIMTlane(Nsimd); + int j0 = lane &(~mask); // inner coor zero + int j1 = lane |(mask) ; // inner coor one + const vobj *vpa = &vp0; + const vobj *vpb = &vp1; + const vobj *vp = (lane&mask) ? 
(vpb) : (vpa); + auto sa = coalescedRead(vp[0],j0); + auto sb = coalescedRead(vp[0],j1); + coalescedWrite(mp0,sa); + coalescedWrite(mp1,sb); +} + #ifndef GRID_SIMT ////////////////////////////////////////// From 0352da34f019bea2fd5a7117600f3da68cf40179 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:26:49 -0500 Subject: [PATCH 176/240] Several deleted files --- systems/Spock/comms.slurm | 26 -------------------------- systems/Spock/config-command | 14 -------------- systems/Spock/dwf.slurm | 26 -------------------------- systems/Spock/dwf4.slurm | 26 -------------------------- systems/Spock/dwf8.slurm | 24 ------------------------ systems/Spock/mpiwrapper.sh | 12 ------------ systems/Spock/sourceme.sh | 9 --------- 7 files changed, 137 deletions(-) delete mode 100644 systems/Spock/comms.slurm delete mode 100644 systems/Spock/config-command delete mode 100644 systems/Spock/dwf.slurm delete mode 100644 systems/Spock/dwf4.slurm delete mode 100644 systems/Spock/dwf8.slurm delete mode 100755 systems/Spock/mpiwrapper.sh delete mode 100644 systems/Spock/sourceme.sh diff --git a/systems/Spock/comms.slurm b/systems/Spock/comms.slurm deleted file mode 100644 index 0841fda0..00000000 --- a/systems/Spock/comms.slurm +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Begin LSF Directives -#SBATCH -A LGT104 -#SBATCH -t 01:00:00 -##SBATCH -U openmpThu -#SBATCH -p ecp -#SBATCH -J comms -#SBATCH -o comms.%J -#SBATCH -e comms.%J -#SBATCH -N 1 -#SBATCH -n 2 - -DIR=. 
-module list -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 -export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -#export MPICH_SMP_SINGLE_COPY_MODE=CMA -export MPICH_SMP_SINGLE_COPY_MODE=NONE -export OMP_NUM_THREADS=8 - -AT=8 -echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 " -srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS - diff --git a/systems/Spock/config-command b/systems/Spock/config-command deleted file mode 100644 index 3ffefe4f..00000000 --- a/systems/Spock/config-command +++ /dev/null @@ -1,14 +0,0 @@ -../../configure --enable-comms=mpi-auto \ ---enable-unified=no \ ---enable-shm=nvlink \ ---enable-accelerator=hip \ ---enable-gen-simd-width=64 \ ---enable-simd=GPU \ ---disable-fermion-reps \ ---disable-gparity \ ---with-gmp=$OLCF_GMP_ROOT \ ---with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ -CXX=hipcc MPICXX=mpicxx \ -CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ ---prefix=/ccs/home/chulwoo/Grid \ - LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " diff --git a/systems/Spock/dwf.slurm b/systems/Spock/dwf.slurm deleted file mode 100644 index 7144a270..00000000 --- a/systems/Spock/dwf.slurm +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Begin LSF Directives -#SBATCH -A LGT104 -#SBATCH -t 01:00:00 -##SBATCH -U openmpThu -#SBATCH -p ecp -#SBATCH -J DWF -#SBATCH -o DWF.%J -#SBATCH -e DWF.%J -#SBATCH -N 1 -#SBATCH -n 1 - -DIR=. 
-module list -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 -export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -#export MPICH_SMP_SINGLE_COPY_MODE=NONE -export MPICH_SMP_SINGLE_COPY_MODE=CMA -export OMP_NUM_THREADS=8 - -AT=8 -echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap" -srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS - diff --git a/systems/Spock/dwf4.slurm b/systems/Spock/dwf4.slurm deleted file mode 100644 index 261929ab..00000000 --- a/systems/Spock/dwf4.slurm +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Begin LSF Directives -#SBATCH -A LGT104 -#SBATCH -t 01:00:00 -##SBATCH -U openmpThu -#SBATCH -p ecp -#SBATCH -J DWF -#SBATCH -o DWF.%J -#SBATCH -e DWF.%J -#SBATCH -N 1 -#SBATCH -n 4 - -DIR=. -module list -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 -export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -export MPICH_SMP_SINGLE_COPY_MODE=NONE -#export MPICH_SMP_SINGLE_COPY_MODE=CMA -export OMP_NUM_THREADS=8 - -AT=8 -echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS - diff --git a/systems/Spock/dwf8.slurm b/systems/Spock/dwf8.slurm deleted file mode 100644 index f2e12b97..00000000 --- a/systems/Spock/dwf8.slurm +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Begin LSF Directives -#SBATCH -A LGT104 -#SBATCH -t 3:00:00 -#SBATCH -p ecp -#SBATCH -J DWF -#SBATCH -o DWF.%J -#SBATCH -e DWF.%J -#SBATCH -N 2 -#SBATCH -n 8 - -DIR=. 
-module list -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 -export MPICH_GPU_SUPPORT_ENABLED=1 -export MPICH_SMP_SINGLE_COPY_MODE=CMA - -export OMP_NUM_THREADS=8 - -AT=8 -echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 16.16.16.48 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun -N2 -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./HMC/Mobius2p1f_DD_RHMC $PARAMS - diff --git a/systems/Spock/mpiwrapper.sh b/systems/Spock/mpiwrapper.sh deleted file mode 100755 index 76c4e364..00000000 --- a/systems/Spock/mpiwrapper.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -lrank=$SLURM_LOCALID - -export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID - -echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING" - -$* - - - diff --git a/systems/Spock/sourceme.sh b/systems/Spock/sourceme.sh deleted file mode 100644 index 415341d0..00000000 --- a/systems/Spock/sourceme.sh +++ /dev/null @@ -1,9 +0,0 @@ -module load emacs -module load PrgEnv-gnu -module load rocm/4.5.0 -module load gmp -module load cray-fftw -module load craype-accel-amd-gfx908 -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 -export MPICH_GPU_SUPPORT_ENABLED=1 -export LD_LIBRARY_PATH=/opt/cray/pe/gcc/mpfr/3.1.4/lib/:$LD_LIBRARY_PATH From 45a001e078c2a1a6bbefa82cdd5d24d6ac93579d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:27:20 -0500 Subject: [PATCH 177/240] Debug compile --- systems/mac-arm/config-command-mpi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi index 52506559..e91d74e6 100644 --- a/systems/mac-arm/config-command-mpi +++ b/systems/mac-arm/config-command-mpi @@ -1,3 +1,3 @@ -#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi -CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure 
--enable-simd=GPU-RRII --enable-comms=mpi +CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi +#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi #CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi From e74666a09c7a160a31684c7a073778e9745cb97b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:34:21 -0500 Subject: [PATCH 178/240] Double length vector type for fast precision change --- Grid/simd/Grid_doubled_vector.h | 666 ++++++++++++++++++++++++++++++++ 1 file changed, 666 insertions(+) create mode 100644 Grid/simd/Grid_doubled_vector.h diff --git a/Grid/simd/Grid_doubled_vector.h b/Grid/simd/Grid_doubled_vector.h new file mode 100644 index 00000000..ee604750 --- /dev/null +++ b/Grid/simd/Grid_doubled_vector.h @@ -0,0 +1,666 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_vector_types.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +class Grid_simd2 { +public: + typedef typename RealPart::type Real; + typedef Vector_type vector_type; + typedef Scalar_type scalar_type; + + typedef union conv_t_union { + Vector_type v; + Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; + accelerator_inline conv_t_union(){}; + } conv_t; + + static constexpr int nvec=2; + Vector_type v[nvec]; + + static accelerator_inline constexpr int Nsimd(void) { + static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " ); + + return nvec*sizeof(Vector_type) / sizeof(Scalar_type); + } + + accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &&rhs) { + for(int n=0;n accelerator_inline + Grid_simd2(const typename std::enable_if::value, S>::type a) { + vsplat(*this, a); + }; + + ///////////////////////////// + // Constructors + ///////////////////////////// + accelerator_inline Grid_simd2 & operator=(const Zero &z) { + vzero(*this); + return (*this); + } + + /////////////////////////////////////////////// + // mac, mult, sub, add, adj + /////////////////////////////////////////////// + + friend accelerator_inline void mac(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ a, + const Grid_simd2 *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + + friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) * (*r); + } + + friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) - (*r); + } + friend accelerator_inline void add(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) + (*r); + } + friend 
accelerator_inline void mac(Grid_simd2 *__restrict__ y, + const Scalar_type *__restrict__ a, + const Grid_simd2 *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) * (*r); + } + friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) - (*r); + } + friend accelerator_inline void add(Grid_simd2 *__restrict__ y, + const Scalar_type *__restrict__ l, + const Grid_simd2 *__restrict__ r) { + *y = (*l) + (*r); + } + + friend accelerator_inline void mac(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ a, + const Scalar_type *__restrict__ x) { + *y = (*a) * (*x) + (*y); + }; + friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) * (*r); + } + friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) - (*r); + } + friend accelerator_inline void add(Grid_simd2 *__restrict__ y, + const Grid_simd2 *__restrict__ l, + const Scalar_type *__restrict__ r) { + *y = (*l) + (*r); + } + + //////////////////////////////////////////////////////////////////////// + // FIXME: gonna remove these load/store, get, set, prefetch + //////////////////////////////////////////////////////////////////////// + friend accelerator_inline void vset(Grid_simd2 &ret, Scalar_type *a) { + for(int n=0;n + friend accelerator_inline Grid_simd2 SimdApply(const functor &func, const Grid_simd2 &v) { + Grid_simd2 ret; + for(int n=0;n + friend accelerator_inline Grid_simd2 SimdApplyBinop(const functor &func, + const Grid_simd2 &x, + const Grid_simd2 &y) { + Grid_simd2 ret; + for(int n=0;n Al Bl Ah,Bh + /////////////////////// + friend accelerator_inline void 
exchange0(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ + out1.v[0] = in1.v[0]; + out1.v[1] = in2.v[0]; + out2.v[0] = in1.v[1]; + out2.v[1] = in2.v[1]; + } + friend accelerator_inline void exchange1(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ + exchange0(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); + exchange0(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); + } + friend accelerator_inline void exchange2(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ + exchange1(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); + exchange1(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); + } + friend accelerator_inline void exchange3(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ + exchange2(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); + exchange2(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); + } + friend accelerator_inline void exchange4(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ + exchange3(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); + exchange3(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); + } + friend accelerator_inline void exchange(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2,int n) + { + if (n==3) { + exchange3(out1,out2,in1,in2); + } else if(n==2) { + exchange2(out1,out2,in1,in2); + } else if(n==1) { + exchange1(out1,out2,in1,in2); + } else if(n==0) { + exchange0(out1,out2,in1,in2); + } + } + //////////////////////////////////////////////////////////////////// + // General permute; assumes vector length is same across + // all subtypes; may not be a good assumption, but could + // add the vector width as a template param for BG/Q for example + //////////////////////////////////////////////////////////////////// + friend accelerator_inline void permute0(Grid_simd2 &y, Grid_simd2 b) { + y.v[0]=b.v[1]; + y.v[1]=b.v[0]; + } + friend accelerator_inline void permute1(Grid_simd2 &y, Grid_simd2 b) { + permute0(y.v[0],b.v[0]); + permute0(y.v[1],b.v[1]); + } + friend accelerator_inline void permute2(Grid_simd2 &y, 
Grid_simd2 b) { + permute1(y.v[0],b.v[0]); + permute1(y.v[1],b.v[1]); + } + friend accelerator_inline void permute3(Grid_simd2 &y, Grid_simd2 b) { + permute2(y.v[0],b.v[0]); + permute2(y.v[1],b.v[1]); + } + friend accelerator_inline void permute4(Grid_simd2 &y, Grid_simd2 b) { + permute3(y.v[0],b.v[0]); + permute3(y.v[1],b.v[1]); + } + friend accelerator_inline void permute(Grid_simd2 &y, Grid_simd2 b, int perm) { + if(perm==3) permute3(y, b); + else if(perm==2) permute2(y, b); + else if(perm==1) permute1(y, b); + else if(perm==0) permute0(y, b); + } + + /////////////////////////////// + // Getting single lanes + /////////////////////////////// + accelerator_inline Scalar_type getlane(int lane) const { + if(lane < vector_type::Nsimd() ) return v[0].getlane(lane); + else return v[1].getlane(lane%vector_type::Nsimd()); + } + + accelerator_inline void putlane(const Scalar_type &S, int lane){ + if(lane < vector_type::Nsimd() ) v[0].putlane(S,lane); + else v[1].putlane(S,lane%vector_type::Nsimd()); + } +}; // end of Grid_simd2 class definition + +/////////////////////////////// +// Define available types +/////////////////////////////// + +typedef Grid_simd2 , vComplexD> vComplexD2; +typedef Grid_simd2 vRealD2; + + + +///////////////////////////////////////// +// Some traits to recognise the types +///////////////////////////////////////// +template +struct is_simd : public std::false_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; +template <> struct is_simd : public std::true_type {}; + +template using IfSimd = Invoke::value, int> >; +template using IfNotSimd = 
Invoke::value, unsigned> >; + +/////////////////////////////////////////////// +// insert / extract with complex support +/////////////////////////////////////////////// +template +accelerator_inline S getlane(const Grid_simd &in,int lane) { + return in.getlane(lane); +} +template +accelerator_inline void putlane(Grid_simd &vec,const S &_S, int lane){ + vec.putlane(_S,lane); +} +template = 0 > +accelerator_inline S getlane(const S &in,int lane) { + return in; +} +template = 0 > +accelerator_inline void putlane(S &vec,const S &_S, int lane){ + vec = _S; +} +template +accelerator_inline S getlane(const Grid_simd2 &in,int lane) { + return in.getlane(lane); +} +template +accelerator_inline void putlane(Grid_simd2 &vec,const S &_S, int lane){ + vec.putlane(_S,lane); +} + + +//////////////////////////////////////////////////////////////////// +// General rotate +//////////////////////////////////////////////////////////////////// + +template +accelerator_inline void vbroadcast(Grid_simd2 &ret,const Grid_simd2 &src,int lane){ + S* typepun =(S*) &src; + vsplat(ret,typepun[lane]); +} +template =0> +accelerator_inline void rbroadcast(Grid_simd2 &ret,const Grid_simd2 &src,int lane){ + typedef typename V::vector_type vector_type; + S* typepun =(S*) &src; + ret.v[0].v = unary(real(typepun[lane]), VsplatSIMD()); + ret.v[1].v = unary(real(typepun[lane]), VsplatSIMD()); +} + + +/////////////////////// +// Splat +/////////////////////// + +// this is only for the complex version +template = 0, class ABtype> +accelerator_inline void vsplat(Grid_simd2 &ret, ABtype a, ABtype b) { + vsplat(ret.v[0],a,b); + vsplat(ret.v[1],a,b); +} + +// overload if complex +template +accelerator_inline void vsplat(Grid_simd2 &ret, EnableIf, S> c) { + vsplat(ret, real(c), imag(c)); +} +template +accelerator_inline void rsplat(Grid_simd2 &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} + +// if real fill with a, if complex fill with a in the real part (first function +// above) +template 
+accelerator_inline void vsplat(Grid_simd2 &ret, NotEnableIf, S> a) +{ + vsplat(ret.v[0],a); + vsplat(ret.v[1],a); +} +////////////////////////// + +/////////////////////////////////////////////// +// Initialise to 1,0,i for the correct types +/////////////////////////////////////////////// +// For complex types +template = 0> +accelerator_inline void vone(Grid_simd2 &ret) { + vsplat(ret, S(1.0, 0.0)); +} +template = 0> +accelerator_inline void vzero(Grid_simd2 &ret) { + vsplat(ret, S(0.0, 0.0)); +} // use xor? +template = 0> +accelerator_inline void vcomplex_i(Grid_simd2 &ret) { + vsplat(ret, S(0.0, 1.0)); +} + +template = 0> +accelerator_inline void visign(Grid_simd2 &ret) { + vsplat(ret, S(1.0, -1.0)); +} +template = 0> +accelerator_inline void vrsign(Grid_simd2 &ret) { + vsplat(ret, S(-1.0, 1.0)); +} + +// if not complex overload here +template = 0> +accelerator_inline void vone(Grid_simd2 &ret) { + vsplat(ret, S(1.0)); +} +template = 0> +accelerator_inline void vzero(Grid_simd2 &ret) { + vsplat(ret, S(0.0)); +} + +// For integral types +template = 0> +accelerator_inline void vone(Grid_simd2 &ret) { + vsplat(ret, 1); +} +template = 0> +accelerator_inline void vzero(Grid_simd2 &ret) { + vsplat(ret, 0); +} +template = 0> +accelerator_inline void vtrue(Grid_simd2 &ret) { + vsplat(ret, 0xFFFFFFFF); +} +template = 0> +accelerator_inline void vfalse(Grid_simd2 &ret) { + vsplat(ret, 0); +} +template +accelerator_inline void zeroit(Grid_simd2 &z) { + vzero(z); +} + +/////////////////////// +// Vstream +/////////////////////// +template = 0> +accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { + vstream(out.v[0],in.v[0]); + vstream(out.v[1],in.v[1]); +} +template = 0> +accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { + vstream(out.v[0],in.v[0]); + vstream(out.v[1],in.v[1]); +} +template = 0> +accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { + vstream(out.v[0],in.v[0]); + vstream(out.v[1],in.v[1]); +} + 
+//////////////////////////////////// +// Arithmetic operator overloads +,-,* +//////////////////////////////////// +template +accelerator_inline Grid_simd2 operator+(Grid_simd2 a, Grid_simd2 b) { + Grid_simd2 ret; + ret.v[0] = a.v[0]+b.v[0]; + ret.v[1] = a.v[1]+b.v[1]; + return ret; +}; + +template +accelerator_inline Grid_simd2 operator-(Grid_simd2 a, Grid_simd2 b) { + Grid_simd2 ret; + ret.v[0] = a.v[0]-b.v[0]; + ret.v[1] = a.v[1]-b.v[1]; + return ret; +}; + +// Distinguish between complex types and others +template = 0> +accelerator_inline Grid_simd2 real_mult(Grid_simd2 a, Grid_simd2 b) { + Grid_simd2 ret; + ret.v[0] =real_mult(a.v[0],b.v[0]); + ret.v[1] =real_mult(a.v[1],b.v[1]); + return ret; +}; +template = 0> +accelerator_inline Grid_simd2 real_madd(Grid_simd2 a, Grid_simd2 b, Grid_simd2 c) { + Grid_simd2 ret; + ret.v[0] =real_madd(a.v[0],b.v[0],c.v[0]); + ret.v[1] =real_madd(a.v[1],b.v[1],c.v[1]); + return ret; +}; + + +// Distinguish between complex types and others +template +accelerator_inline Grid_simd2 operator*(Grid_simd2 a, Grid_simd2 b) { + Grid_simd2 ret; + ret.v[0] = a.v[0]*b.v[0]; + ret.v[1] = a.v[1]*b.v[1]; + return ret; +}; + +// Distinguish between complex types and others +template +accelerator_inline Grid_simd2 operator/(Grid_simd2 a, Grid_simd2 b) { + Grid_simd2 ret; + ret.v[0] = a.v[0]/b.v[0]; + ret.v[1] = a.v[1]/b.v[1]; + return ret; +}; + +/////////////////////// +// Conjugate +/////////////////////// +template +accelerator_inline Grid_simd2 conjugate(const Grid_simd2 &in) { + Grid_simd2 ret; + ret.v[0] = conjugate(in.v[0]); + ret.v[1] = conjugate(in.v[1]); + return ret; +} +template = 0> +accelerator_inline Grid_simd2 adj(const Grid_simd2 &in) { + return conjugate(in); +} + +/////////////////////// +// timesMinusI +/////////////////////// +template +accelerator_inline void timesMinusI(Grid_simd2 &ret, const Grid_simd2 &in) { + timesMinusI(ret.v[0],in.v[0]); + timesMinusI(ret.v[1],in.v[1]); +} +template +accelerator_inline Grid_simd2 
timesMinusI(const Grid_simd2 &in) { + Grid_simd2 ret; + timesMinusI(ret.v[0],in.v[0]); + timesMinusI(ret.v[1],in.v[1]); + return ret; +} + +/////////////////////// +// timesI +/////////////////////// +template +accelerator_inline void timesI(Grid_simd2 &ret, const Grid_simd2 &in) { + timesI(ret.v[0],in.v[0]); + timesI(ret.v[1],in.v[1]); +} +template +accelerator_inline Grid_simd2 timesI(const Grid_simd2 &in) { + Grid_simd2 ret; + timesI(ret.v[0],in.v[0]); + timesI(ret.v[1],in.v[1]); + return ret; +} + +///////////////////// +// Inner, outer +///////////////////// +template +accelerator_inline Grid_simd2 innerProduct(const Grid_simd2 &l,const Grid_simd2 &r) { + return conjugate(l) * r; +} +template +accelerator_inline Grid_simd2 outerProduct(const Grid_simd2 &l,const Grid_simd2 &r) { + return l * conjugate(r); +} + +template +accelerator_inline Grid_simd2 trace(const Grid_simd2 &arg) { + return arg; +} + +//////////////////////////////////////////////////////////// +// copy/splat complex real parts into real; +// insert real into complex and zero imag; +//////////////////////////////////////////////////////////// +accelerator_inline void precisionChange(vComplexD2 &out,const vComplexF &in){ + Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v); +} +accelerator_inline void precisionChange(vComplexF &out,const vComplexD2 &in){ + out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v); +} +accelerator_inline void precisionChange(vComplexD2 *out,const vComplexF *in,int nvec){ + for(int m=0;m Date: Tue, 15 Nov 2022 16:36:46 -0500 Subject: [PATCH 179/240] Record some perturbative free field calculation --- examples/Example_taku.cc | 383 +++++++++++++++++++++++++++++++++ examples/Example_taku2.cc | 433 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 816 insertions(+) create mode 100644 examples/Example_taku.cc create mode 100644 examples/Example_taku2.cc diff --git a/examples/Example_taku.cc b/examples/Example_taku.cc new file mode 100644 index 
00000000..b9ad272e --- /dev/null +++ b/examples/Example_taku.cc @@ -0,0 +1,383 @@ +/* + * Warning: This code illustrative only: not well tested, and not meant for production use + * without regression / tests being applied + */ + +#include + +using namespace std; +using namespace Grid; + +RealD LLscale =1.0; +RealD LCscale =1.0; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < 
+void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + LatticePropagator prop5(FGrid); + + ConjugateGradient CG(1.0e-8,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + LatticePropagator Axial_mu(UGrid); + LatticePropagator Vector_mu(UGrid); + + LatticeComplex PA (UGrid); + LatticeComplex VV (UGrid); + LatticeComplex PJ5q(UGrid); + LatticeComplex PP (UGrid); + + std::vector sumPA; + std::vector sumVV; + std::vector sumPP; + std::vector sumPJ5q; + + Gamma g5(Gamma::Algebra::Gamma5); + D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + sliceSum(PA,sumPA,Tdir); + + int Nt{static_cast(sumPA.size())}; + + for(int t=0;t >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=3; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, + {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, + {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + RealD M5=1.8; + if( argc > 1 && argv[1][0] != '-' ) + { + 
std::cout<::ColdConfiguration(Umu); + config="ColdConfig"; + // RealD P=1.0; // Don't scale + RealD P=0.5871119; // 48I + // RealD P=0.6153342; // 64I + // RealD P=0.6388238 // 32Ifine + RealD u0 = sqrt(sqrt(P)); + RealD M5mf = M5 - 4.0*(1.0-u0); + RealD w0 = 1.0 - M5mf; +#if 0 + // M5=1.8 with U=u0 + Umu = Umu * u0; + LLscale = 1.0; + LCscale = 1.0; + std::cout< PointProps(nmass,UGrid); + // std::vector GaussProps(nmass,UGrid); + // std::vector Z2Props (nmass,UGrid); + + for(int m=0;m + +using namespace std; +using namespace Grid; + +RealD LLscale =1.0; +RealD LCscale =1.0; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = 
(width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = source.Grid(); + GridBase *FGrid = D.FermionGrid(); + bool fiveD = true; //calculate 4d free propagator + RealD mass = D.Mass(); + LatticeFermion src4 (UGrid); + LatticeFermion result4 (UGrid); + LatticeFermion result5(FGrid); + LatticeFermion src5(FGrid); + LatticePropagator prop5(FGrid); + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + D.FreePropagator(src5,result5,mass,true); + std::cout<(prop5,result5,s,c); + FermToProp(propagator,result4,s,c); + } + } + + LatticePropagator Vector_mu(UGrid); + LatticeComplex VV (UGrid); + std::vector sumVV; + Gamma::Algebra GammaV[3] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ + }; + for( int mu=0;mu<3;mu++ ) { + Gamma gV(GammaV[mu]); + D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); + VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current + sliceSum(VV,sumVV,Tdir); + int Nt = sumVV.size(); + for(int t=0;t +void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + LatticePropagator prop5(FGrid); + + ConjugateGradient CG(1.0e-6,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(prop5,result5,s,c); + 
FermToProp(propagator,result4,s,c); + } + } + LatticePropagator Axial_mu(UGrid); + LatticePropagator Vector_mu(UGrid); + + LatticeComplex PA (UGrid); + LatticeComplex VV (UGrid); + LatticeComplex PJ5q(UGrid); + LatticeComplex PP (UGrid); + + std::vector sumPA; + std::vector sumVV; + std::vector sumPP; + std::vector sumPJ5q; + + Gamma g5(Gamma::Algebra::Gamma5); + D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + sliceSum(PA,sumPA,Tdir); + + int Nt{static_cast(sumPA.size())}; + + for(int t=0;t >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=3; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, + {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, + // {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} + {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + RealD M5=atof(getenv("M5")); + RealD mq = atof(getenv("mass")); + std::vector masses({ mq} ); // u/d, s, c ?? 
+ if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + config="ColdConfig"; + // RealD P=1.0; // Don't scale + // RealD P=0.6153342; // 64I + // RealD P=0.6388238 // 32Ifine + // RealD P=0.5871119; // 48I + // RealD u0 = sqrt(sqrt(P)); + // Umu = Umu * u0; + RealD w0 = 1 - M5; + LLscale = 1.0/(1-w0*w0)/(1-w0*w0); + LCscale = 1.0/(1-w0*w0)/(1-w0*w0); + std::cout< PointProps(nmass,UGrid); + std::vector FreeProps(nmass,UGrid); + LatticePropagator delta(UGrid); + + for(int m=0;m Date: Tue, 15 Nov 2022 16:37:15 -0500 Subject: [PATCH 180/240] Partial dirichlet changes --- benchmarks/Benchmark_comms.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 21e048f4..5ffca2b6 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -220,7 +220,7 @@ int main (int argc, char ** argv) xmit_to_rank,1, (void *)&rbuf[mu][0], recv_from_rank,1, - bytes,mu); + bytes,bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -231,7 +231,7 @@ int main (int argc, char ** argv) xmit_to_rank,1, (void *)&rbuf[mu+4][0], recv_from_rank,1, - bytes,mu+4); + bytes,bytes,mu+4); } } @@ -312,7 +312,7 @@ int main (int argc, char ** argv) xmit_to_rank,1, (void *)&rbuf[mu][0], recv_from_rank,1, - bytes,mu); + bytes,bytes,mu); Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); @@ -325,7 +325,7 @@ int main (int argc, char ** argv) xmit_to_rank,1, (void *)&rbuf[mu+4][0], recv_from_rank,1, - bytes,mu+4); + bytes,bytes,mu+4); Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); @@ -412,7 +412,7 @@ int main (int argc, char ** argv) } int tid = omp_get_thread_num(); tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, - (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); + (void *)&rbuf[dir][0], recv_from_rank,1, bytes,bytes,tid); thread_critical { dbytes+=tbytes; } } From 1af7572c61754b70cf0e8cfee3ee6cfd64cffa04 Mon Sep 17 
00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 16:38:51 -0500 Subject: [PATCH 181/240] Some test HMCs for DDHMC --- HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc | 520 ++++++++++++++++++++++++ HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc | 567 +++++++++++++++++++++++++++ 2 files changed, 1087 insertions(+) create mode 100644 HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc create mode 100644 HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc new file mode 100644 index 00000000..24d8951e --- /dev/null +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc @@ -0,0 +1,520 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif + MPCG(src,psi); + } + }; + +NAMESPACE_END(Grid); + + +int main(int argc, char **argv) { + using 
namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + //typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // MD.MDsteps = 4; + MD.MDsteps = 4; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + // std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 2.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-5; + OFRp.mdtolerance= 1.0e-3; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + 
std::vector ActionTolByPole({ + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-10,1.0e-10,1.0e-10,1.0e-10, + 1.0e-10,1.0e-10,1.0e-10,1.0e-10 + }); + std::vector MDTolByPole({ + 3.0e-5,1.0e-5,3.0e-6,1.0e-6, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-10,1.0e-10,1.0e-10,1.0e-10 + }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAF; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG_EOFA; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeFieldF UF(GridPtrF); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + Params.dirichlet=NonDirichlet; + FermionAction::ImplParams ParamsDir(boundary); + ParamsDir.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-8; + double MDStoppingCondition = 1e-7; + double MDStoppingConditionLoose = 1e-7; + double MDStoppingConditionStrange = 1e-7; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient 
MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(30); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.1; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-5; + SFRp.mdtolerance= 2.0e-4; + SFRp.degree = 14; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF); + LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF); + + const int MX_inner = 1000; + MxPCG_EOFA ActionCGL(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA DerivativeCGL(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, 
+ Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA ActionCGR(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + MxPCG_EOFA DerivativeCGR(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCGL, ActionCGR, + DerivativeCGL, DerivativeCGR, + SFRp, true); + // Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector DenominatorsF; + std::vector *> Quotients; + std::vector *> Bdys; + std::vector ActionMPCG; + std::vector MPCG; + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } else { + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + for(int h=0;hSetTolerances(ActionTolByPole,MDTolByPole); + } + int nquo=Quotients.size(); + Level2.push_back(Bdys[0]); + Level2.push_back(Bdys[1]); + for(int h=0;h +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software 
Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif + MPCG(src,psi); + } + }; + +NAMESPACE_END(Grid); + + +int main(int argc, char **argv) { + using 
namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + //typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // MD.MDsteps = 4; + MD.MDsteps = 4; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 4.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-5; + OFRp.mdtolerance= 1.0e-3; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector 
ActionTolByPole({ + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorD2; + typedef SchurDiagMooeeOperator LinearOperatorEOFAF; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD2; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG_EOFA; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + //Dirichlet[1] = 0; + //Dirichlet[2] = 0; + //Dirichlet[3] = 0; + + // + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=4; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + LatticeGaugeFieldF UF(GridPtrF); UF=Zero(); + LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionAction::ImplParams ParamsDir(boundary); + FermionActionF::ImplParams ParamsF(boundary); + FermionActionF::ImplParams ParamsDirF(boundary); + Params.dirichlet=NonDirichlet; + ParamsF.dirichlet=NonDirichlet; + ParamsDir.dirichlet=Dirichlet; + ParamsDirF.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 
1e-9; + double StoppingCondition = 1e-8; + double MDStoppingCondition = 1e-7; + double MDStoppingConditionLoose = 1e-7; + double MDStoppingConditionStrange = 1e-7; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(30); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.25; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-5; + SFRp.mdtolerance= 2.0e-4; + SFRp.degree = 8; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF); + LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF); + + const int MX_inner = 1000; + MxPCG_EOFA 
ActionCGL(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA DerivativeCGL(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA ActionCGR(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + MxPCG_EOFA DerivativeCGR(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCGL, ActionCGR, + DerivativeCGL, DerivativeCGR, + SFRp, true); + // Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector NumeratorsF; + std::vector DenominatorsF; + std::vector NumeratorsD2; + std::vector DenominatorsD2; + std::vector *> Quotients; + std::vector ActionMPCG; + std::vector MPCG; + +#define MIXED_PRECISION +#ifdef MIXED_PRECISION + std::vector *> Bdys; +#else + std::vector *> Bdys; +#endif + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } else { +#ifdef MIXED_PRECISION + // Use the D2 data types and make them use same grid as single + 
FermionActionD2::ImplParams ParamsDenD2(boundary); + FermionActionD2::ImplParams ParamsNumD2(boundary); + + ParamsDenD2.dirichlet = ParamsDen.dirichlet; + DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2)); + + ParamsNumD2.dirichlet = ParamsNum.dirichlet; + NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2)); + + Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( + *Numerators[h],*Denominators[h], + *NumeratorsF[h],*DenominatorsF[h], + *NumeratorsD2[h],*DenominatorsD2[h], + OFRp, 200) ); + Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( + *Numerators[h],*Denominators[h], + *NumeratorsF[h],*DenominatorsF[h], + *NumeratorsD2[h],*DenominatorsD2[h], + OFRp, 200) ); +#else + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); +#endif + } + } + for(int h=0;hSetTolerances(ActionTolByPole,MDTolByPole); + } + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + for(int h=0;h Date: Tue, 15 Nov 2022 16:39:39 -0500 Subject: [PATCH 182/240] Multilevel integrator test --- HMC/Mobius2p1f_DD_EOFA_96I_3level.cc | 516 +++++++++++++++++++++++++++ 1 file changed, 516 insertions(+) create mode 100644 HMC/Mobius2p1f_DD_EOFA_96I_3level.cc diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc b/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc new file mode 100644 index 00000000..c305567c --- /dev/null +++ b/HMC/Mobius2p1f_DD_EOFA_96I_3level.cc @@ -0,0 +1,516 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: 
Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif + MPCG(src,psi); + } + }; + +NAMESPACE_END(Grid); + + +int main(int argc, char **argv) { + using 
namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 4; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + // std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 5e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 2.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-5; + OFRp.mdtolerance= 1.0e-3; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.degree = 12; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector 
ActionTolByPole({ + 1.0e-6,1.0e-6,1.0e-7,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-10,1.0e-10,1.0e-10,1.0e-10 + }); + std::vector MDTolByPole({ + 3.0e-4,3.0e-4,3.0e-5,1.0e-5, + 1.0e-5,1.0e-5,1.0e-5,1.0e-5, + 1.0e-8,1.0e-10,1.0e-10,1.0e-10 + }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAF; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG_EOFA; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=3; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeFieldF UF(GridPtrF); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + Params.dirichlet=NonDirichlet; + FermionAction::ImplParams ParamsDir(boundary); + ParamsDir.dirichlet=Dirichlet; + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-8; + double MDStoppingCondition = 1e-6; + double MDStoppingConditionLoose = 1e-6; + double MDStoppingConditionStrange = 1e-8; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient 
MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(8); // 6 x 20 = 120 = 8 x 15 + ActionLevel Level3(15); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.1; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-5; + SFRp.mdtolerance= 1.0e-3; + SFRp.degree = 14; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF); + LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF); + + const int MX_inner = 1000; + MxPCG_EOFA ActionCGL(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA DerivativeCGL(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + 
Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA ActionCGR(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + MxPCG_EOFA DerivativeCGR(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCGL, ActionCGR, + DerivativeCGL, DerivativeCGR, + SFRp, true); + Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector DenominatorsF; + std::vector *> Quotients; + std::vector *> Bdys; + std::vector ActionMPCG; + std::vector MPCG; + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } else { + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + for(int h=0;hSetTolerances(ActionTolByPole,MDTolByPole); + } + int nquo=Quotients.size(); + // Level1.push_back(Bdys[0]); + // Level1.push_back(Bdys[1]); + // Level2.push_back(Bdys[0]); + // Level2.push_back(Bdys[1]); + for(int h=0;h Date: Tue, 15 Nov 2022 16:40:38 -0500 Subject: [PATCH 183/240] Partial Dirichlet test 
--- benchmarks/Benchmark_dwf_fp32_partial.cc | 463 +++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 benchmarks/Benchmark_dwf_fp32_partial.cc diff --git a/benchmarks/Benchmark_dwf_fp32_partial.cc b/benchmarks/Benchmark_dwf_fp32_partial.cc new file mode 100644 index 00000000..4db3022a --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32_partial.cc @@ -0,0 +1,463 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +//////////////////////// +/// Move to domains //// +//////////////////////// + +Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT +}; + +void Benchmark(int Ls, Coordinate Dirichlet, int partial); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + int Ls=8; + for(int i=0;i> Ls; + } + } + + ////////////////// + // With comms + ////////////////// + Coordinate Dirichlet(Nd+1,0); + + for(auto partial : {0}) { + std::cout << "\n\n\n\n\n\n" <1 ? 1 : 0; + Dirichlet[0] = 0; + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + + for(auto partial : {0,1}) { + std::cout << "\n\n\n\n\n\n" <1 ? 
1 : 0; + Dirichlet[0] = 0; + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; + + for(auto partial : {0,1}) { + std::cout << "\n\n\n\n\n\n" < seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); +#define SINGLE +#ifdef SINGLE + typedef vComplexF Simd; + typedef LatticeFermionF FermionField; + typedef LatticeGaugeFieldF GaugeField; + typedef LatticeColourMatrixF ColourMatrixField; + typedef DomainWallFermionF FermionAction; +#endif +#ifdef DOUBLE + typedef vComplexD Simd; + typedef LatticeFermionD FermionField; + typedef LatticeGaugeFieldD GaugeField; + typedef LatticeColourMatrixD ColourMatrixField; + typedef DomainWallFermionD FermionAction; +#endif +#ifdef DOUBLE2 + typedef vComplexD2 Simd; + typedef LatticeFermionD2 FermionField; + typedef LatticeGaugeFieldD2 GaugeField; + typedef LatticeColourMatrixD2 ColourMatrixField; + typedef DomainWallFermionD2 FermionAction; +#endif + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + + + FermionField src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + UmuCopy=Umu; + UmuFull=Umu; + std::cout << 
GridLogMessage << "Random gauge initialised " << std::endl; + + //////////////////////////////////// + // Apply BCs + //////////////////////////////////// + Coordinate Block(4); + for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1]; + + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl; + std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl; + + DirichletFilter Filter(Block); + Filter.applyFilter(Umu); + if(!partial) Filter.applyFilter(UmuCopy); + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + std::vector U(4,UGrid); + std::vector Ucopy(4,UGrid); + for(int mu=0;mu(Umu,mu); + Ucopy[mu] = PeekIndex(UmuCopy,mu); + } + + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + std::cout<Barrier(); + + DumpSliceNorm("s-slice ref ",ref,1); + DumpSliceNorm("s-slice res ",result,1); + DumpSliceNorm("s-slice error ",err,1); + exit(-1); + } + assert (n2e< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;sBarrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu Date: Tue, 15 Nov 2022 17:00:49 -0500 Subject: [PATCH 184/240] Benchmark_comms fix --- benchmarks/Benchmark_comms.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 5ffca2b6..00526893 100644 --- a/benchmarks/Benchmark_comms.cc 
+++ b/benchmarks/Benchmark_comms.cc @@ -412,7 +412,7 @@ int main (int argc, char ** argv) } int tid = omp_get_thread_num(); tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, - (void *)&rbuf[dir][0], recv_from_rank,1, bytes,bytes,tid); + (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); thread_critical { dbytes+=tbytes; } } From 6209120de9d861576eb0ce0f1ef706c445fb2a2a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 17:25:58 -0500 Subject: [PATCH 185/240] Fix to GPU compile attempt --- Grid/stencil/SimpleCompressor.h | 2 +- Grid/stencil/Stencil.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index eda6d9e7..ffbe752c 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -73,7 +73,7 @@ public: auto vp0= &mm.vpointers[0][0]; auto vp1= &mm.vpointers[1][0]; auto type= mm.type; - accelerator_forNB(o,mm.buffer_size/2,vobj::Nsimd(),{ + accelerator_forNB(o,mm.buffer_size/2,Merger::Nsimd(),{ decompress.Exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); }); } diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index dd8b646e..1e0b5028 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -245,6 +245,7 @@ public: Integer rbytes; }; struct Merge { + static constexpr int Nsimd = vobj::Nsimd(); cobj * mpointer; Vector rpointers; Vector vpointers; @@ -254,6 +255,7 @@ public: Coordinate dims; }; struct Decompress { + static constexpr int Nsimd = vobj::Nsimd(); cobj * kernel_p; cobj * mpi_p; Integer buffer_size; From ddad25211b4c3db0b922fd8ff43a22302dd5a26b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 17:47:52 -0500 Subject: [PATCH 186/240] Extra instantiations --- .../WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc | 1 + .../ContinuedFractionFermion5DInstantiationWilsonImplD2.cc | 1 + .../DomainWallEOFAFermionInstantiationWilsonImplD2.cc | 1 + 
.../WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc | 1 + .../PartialFractionFermion5DInstantiationWilsonImplD2.cc | 1 + .../WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc | 1 + .../WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc | 1 + .../WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc | 1 + .../WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc | 1 + .../WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc | 1 + Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h | 1 + .../ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc | 1 + .../ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc | 1 + .../DomainWallEOFAFermionInstantiationZWilsonImplD2.cc | 1 + .../ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc | 1 + .../PartialFractionFermion5DInstantiationZWilsonImplD2.cc | 1 + .../ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc | 1 + .../ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc | 1 + Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h | 1 + 19 files changed, 19 insertions(+) create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc create mode 120000 
Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc create mode 100644 Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc create mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc create mode 100644 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..cb1db625 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc new file 
mode 120000 index 00000000..c2d4b8fc --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..2f550a2b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..7a8f1172 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..7f4cea71 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..9cc05107 --- /dev/null +++ 
b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..804d0884 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..5f6ab65e --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..01c35e7b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc new file mode 120000 index 00000000..d5789bcf --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git 
a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h new file mode 100644 index 00000000..a836ff03 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h @@ -0,0 +1 @@ +#define IMPLEMENTATION WilsonImplD2 diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..cb1db625 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..c2d4b8fc --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..2f550a2b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..7a8f1172 --- 
/dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..7f4cea71 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..804d0884 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc new file mode 120000 index 00000000..01c35e7b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc @@ -0,0 +1 @@ +../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h new file mode 100644 index 00000000..067d6080 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h @@ -0,0 +1 @@ +#define IMPLEMENTATION ZWilsonImplD2 From e2a938e7f78b52fd606a2879d91ced6902179645 Mon Sep 17 00:00:00 2001 From: Peter Boyle 
Date: Tue, 15 Nov 2022 17:48:18 -0500 Subject: [PATCH 187/240] GPU happy for compile...? --- Grid/qcd/action/fermion/WilsonCompressor.h | 2 +- Grid/stencil/SimpleCompressor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index e2ced552..9ecb1c49 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -141,7 +141,7 @@ public: auto vp1= &mm.vpointers[1][0]; auto type= mm.type; int nnum = num/Ls; - accelerator_forNB(o,num,vobj::Nsimd(),{ + accelerator_forNB(o,num,Merger::Nsimd,{ int s=o%Ls; int hxyz=o/Ls; // xyzt related component diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index ffbe752c..ccbfdb29 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -73,7 +73,7 @@ public: auto vp0= &mm.vpointers[0][0]; auto vp1= &mm.vpointers[1][0]; auto type= mm.type; - accelerator_forNB(o,mm.buffer_size/2,Merger::Nsimd(),{ + accelerator_forNB(o,mm.buffer_size/2,Merger::Nsimd,{ decompress.Exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); }); } From e51eaedc568fd678132e94bb3d70938048508fea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Nov 2022 22:58:30 -0500 Subject: [PATCH 188/240] Making tests compile --- Grid/qcd/action/ActionParams.h | 9 ++++++++ .../action/scalar/ScalarInteractionAction.h | 6 ++--- Grid/qcd/hmc/HMC.h | 7 +++--- Grid/qcd/modules/Registration.h | 12 +++++----- Grid/stencil/Stencil.h | 2 +- systems/Crusher/config-command | 3 +-- tests/core/Test_cf_coarsen_support.cc | 8 +++---- tests/core/Test_checker.cc | 4 ++-- tests/core/Test_contfrac_even_odd.cc | 16 +++++++------- tests/core/Test_dwf_eofa_even_odd.cc | 4 ++-- tests/core/Test_dwf_even_odd.cc | 4 ++-- tests/core/Test_gpwilson_even_odd.cc | 8 +++---- tests/core/Test_mobius_eofa_even_odd.cc | 4 ++-- tests/core/Test_mobius_even_odd.cc | 6 ++--- tests/core/Test_staggered.cc | 10 
++++----- tests/core/Test_staggered5D.cc | 10 ++++----- tests/core/Test_staggered_naive.cc | 10 ++++----- tests/core/Test_wilson_clover.cc | 20 ++++++++--------- tests/core/Test_wilson_conserved_current.cc | 22 +++++++++---------- tests/core/Test_wilson_even_odd.cc | 4 ++-- tests/core/Test_wilson_exp_clover.cc | 20 ++++++++--------- .../core/Test_wilson_twisted_mass_even_odd.cc | 4 ++-- tests/core/Test_zmobius_even_odd.cc | 4 ++-- tests/forces/Test_contfrac_force.cc | 2 +- tests/forces/Test_dwf_force.cc | 2 +- tests/forces/Test_dwf_force_eofa.cc | 4 ++-- tests/forces/Test_dwf_gpforce.cc | 6 ++--- tests/forces/Test_dwf_gpforce_eofa.cc | 2 +- tests/forces/Test_gpdwf_force.cc | 6 ++--- tests/forces/Test_gpdwf_force_1f_2f.cc | 4 ++-- tests/forces/Test_gpwilson_force.cc | 6 ++--- tests/forces/Test_mobius_force.cc | 2 +- tests/forces/Test_mobius_force_eofa.cc | 4 ++-- tests/forces/Test_mobius_gpforce_eofa.cc | 2 +- tests/forces/Test_partfrac_force.cc | 2 +- tests/forces/Test_wilson_force.cc | 2 +- tests/forces/Test_wilsonclover_force.cc | 2 +- tests/forces/Test_zmobius_force.cc | 2 +- tests/hmc/Test_action_dwf_gparity2fvs1f.cc | 8 +++---- tests/hmc/Test_hmc_EODWFRatio.cc | 9 +------- tests/hmc/Test_hmc_EODWFRatio_Gparity.cc | 8 +------ tests/hmc/Test_hmc_EOMobiusRatio.cc | 9 +------- .../hmc/Test_hmc_EOMobiusRatioManyFlavour.cc | 8 +------ .../Test_hmc_EOWilsonCloverFermionGauge.cc | 2 +- tests/hmc/Test_hmc_EOWilsonFermionGauge.cc | 2 +- tests/hmc/Test_hmc_EOWilsonRatio.cc | 2 +- tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 2 +- tests/hmc/Test_hmc_GparityWilsonGauge.cc | 2 +- tests/hmc/Test_hmc_Mobius2p1f.cc | 10 ++++----- tests/hmc/Test_hmc_ScalarActionNxN.cc | 3 +-- tests/hmc/Test_hmc_WC2ASFG_Production.cc | 2 +- tests/hmc/Test_hmc_WC2SFG_Production.cc | 2 +- tests/hmc/Test_hmc_WCFG_Production.cc | 2 +- tests/hmc/Test_hmc_WCMixedRepFG_Production.cc | 4 ++-- tests/hmc/Test_hmc_WCadjFG_Production.cc | 2 +- .../hmc/Test_hmc_WilsonAdjointFermionGauge.cc | 2 +- 
.../hmc/Test_hmc_WilsonCloverFermionGauge.cc | 2 +- tests/hmc/Test_hmc_WilsonFermionGauge.cc | 2 +- tests/hmc/Test_hmc_WilsonGauge.cc | 6 +++-- ..._WilsonMixedRepresentationsFermionGauge.cc | 4 ++-- tests/hmc/Test_hmc_WilsonRatio.cc | 2 +- tests/hmc/Test_hmc_WilsonTMFermionGauge.cc | 2 +- ...hmc_WilsonTwoIndexSymmetricFermionGauge.cc | 2 +- tests/hmc/Test_rhmc_EOWilson1p1.cc | 2 +- tests/hmc/Test_rhmc_EOWilsonRatio.cc | 2 +- tests/hmc/Test_rhmc_Wilson1p1.cc | 2 +- tests/hmc/Test_rhmc_WilsonRatio.cc | 2 +- 67 files changed, 168 insertions(+), 184 deletions(-) diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index b2a06280..b332a2c3 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -35,6 +35,15 @@ directory NAMESPACE_BEGIN(Grid); // These can move into a params header and be given MacroMagic serialisation +struct DefaultImplParams { + Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; + DefaultImplParams() { + dirichlet.resize(0); + partialDirichlet=0; + }; +}; + struct GparityWilsonImplParams { Coordinate twists; //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index e04dd486..7708a489 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -47,7 +47,7 @@ private: const unsigned int N = Impl::Group::Dimension; typedef typename Field::vector_object vobj; - typedef CartesianStencil Stencil; + typedef CartesianStencil Stencil; SimpleCompressor compressor; int npoint = 2 * Ndim; @@ -82,7 +82,7 @@ public: virtual RealD S(const Field &p) { assert(p.Grid()->Nd() == Ndim); - static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0); + static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); 
Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid()); phisquared = p * p; @@ -133,7 +133,7 @@ public: double interm_t = usecond(); // move this outside - static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0); + static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements); phiStencil.HaloExchange(p, compressor); double halo_t = usecond(); diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 745e53c8..d4739fb0 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -143,6 +143,7 @@ private: GridBase *Grid = U.Grid(); if(Params.PerformRandomShift){ +#if 0 ////////////////////////////////////////////////////////////////////////////////////////////////////// // Mainly for DDHMC perform a random translation of U modulo volume ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -167,11 +168,11 @@ private: //shift all fields together in a way that respects the gauge BCs for(int mu=0; mu < Grid->Nd(); mu++) Umu[mu] = FieldImplementation::CshiftLink(Umu[mu],d,shift); - } - for(int mu=0;muNd();mu++) PokeIndex(U,Umu[mu],mu); - + for(int mu=0;muNd();mu++) PokeIndex(U,Umu[mu],mu); + } std::cout << GridLogMessage << "--------------------------------------------------\n"; +#endif } TheIntegrator.reset_timer(); diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h index 28a9fdae..db43abe1 100644 --- a/Grid/qcd/modules/Registration.h +++ b/Grid/qcd/modules/Registration.h @@ -78,13 +78,13 @@ static Registrar, // Now a specific registration with a fermion field // here must instantiate CG and CR for every new fermion field type (macro!!) 
-static Registrar< ConjugateGradientModule, - HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); +static Registrar< ConjugateGradientModule, + HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); -static Registrar< BiCGSTABModule, - HMC_SolverModuleFactory > __BiCGWFmodXMLInit("BiCGSTAB"); -static Registrar< ConjugateResidualModule, - HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); +static Registrar< BiCGSTABModule, + HMC_SolverModuleFactory > __BiCGWFmodXMLInit("BiCGSTAB"); +static Registrar< ConjugateResidualModule, + HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); // add the staggered, scalar versions here diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e0b5028..ff8a6433 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -698,7 +698,7 @@ public: int checkerboard, const std::vector &directions, const std::vector &distances, - Parameters p) + Parameters p=Parameters()) { face_table_computed=0; _grid = grid; diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 57b93e03..ca1d6348 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -4,11 +4,10 @@ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ ---disable-fermion-reps \ --with-gmp=$OLCF_GMP_ROOT \ --with-fftw=$FFTW_DIR/.. 
\ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ ---enable-gparity \ +--disable-gparity \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \ LDFLAGS="-L/lib64 -L/opt/rocm-5.2.0/lib/ -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " diff --git a/tests/core/Test_cf_coarsen_support.cc b/tests/core/Test_cf_coarsen_support.cc index ad0309b9..0812ab7e 100644 --- a/tests/core/Test_cf_coarsen_support.cc +++ b/tests/core/Test_cf_coarsen_support.cc @@ -75,8 +75,8 @@ int main (int argc, char ** argv) RealD M5=1.8; { - OverlapWilsonContFracTanhFermionR Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); - HermitianLinearOperator HermIndefOp(Dcf); + OverlapWilsonContFracTanhFermionD Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + HermitianLinearOperator HermIndefOp(Dcf); HermIndefOp.Op(src,ref); HermIndefOp.OpDiag(src,result); @@ -92,8 +92,8 @@ int main (int argc, char ** argv) } { - OverlapWilsonPartialFractionTanhFermionR Dpf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); - HermitianLinearOperator HermIndefOp(Dpf); + OverlapWilsonPartialFractionTanhFermionD Dpf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + HermitianLinearOperator HermIndefOp(Dpf); HermIndefOp.Op(src,ref); HermIndefOp.OpDiag(src,result); diff --git a/tests/core/Test_checker.cc b/tests/core/Test_checker.cc index f87133e5..c2382e91 100644 --- a/tests/core/Test_checker.cc +++ b/tests/core/Test_checker.cc @@ -140,14 +140,14 @@ int main (int argc, char ** argv) // RealD mass=0.1; // RealD M5=1.8; - // DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + // DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); // LatticeFermion src_o(FrbGrid); // LatticeFermion result_o(FrbGrid); // pickCheckerboard(Odd,src_o,src); // result_o=Zero(); - // SchurDiagMooeeOperator HermOpEO(Ddwf); + // SchurDiagMooeeOperator HermOpEO(Ddwf); // ConjugateGradient CG(1.0e-8,10000); // 
CG(HermOpEO,src_o,result_o); diff --git a/tests/core/Test_contfrac_even_odd.cc b/tests/core/Test_contfrac_even_odd.cc index 42bfe361..5731719a 100644 --- a/tests/core/Test_contfrac_even_odd.cc +++ b/tests/core/Test_contfrac_even_odd.cc @@ -76,20 +76,20 @@ int main (int argc, char ** argv) RealD M5 =1.8; std::cout<(Dcf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonContFracTanhFermionD Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + TestWhat(Dcf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dcfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonContFracZolotarevFermionD Dcfz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,6.0); + TestWhat(Dcfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dpf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonPartialFractionTanhFermionD Dpf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + TestWhat(Dpf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dpfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonPartialFractionZolotarevFermionD Dpfz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,6.0); + TestWhat(Dpfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); Grid_finalize(); } diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc index 7812ebb8..532c740a 100644 --- a/tests/core/Test_dwf_eofa_even_odd.cc +++ b/tests/core/Test_dwf_eofa_even_odd.cc @@ -90,7 +90,7 @@ int main (int argc, char ** argv) RealD shift = 0.1234; RealD M5 = 1.8; int pm = 1; - DomainWallEOFAFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5); + DomainWallEOFAFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -216,7 +216,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd , phi_o, phi); - SchurDiagMooeeOperator HermOpEO(Ddwf); + 
SchurDiagMooeeOperator HermOpEO(Ddwf); HermOpEO.MpcDagMpc(chi_e, dchi_e); HermOpEO.MpcDagMpc(chi_o, dchi_o); diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc index 924eb3b7..f915b439 100644 --- a/tests/core/Test_dwf_even_odd.cc +++ b/tests/core/Test_dwf_even_odd.cc @@ -86,7 +86,7 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5 =1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -213,7 +213,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index d510657e..c8587435 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -52,7 +52,7 @@ int main (int argc, char ** argv) // pRNG.SeedFixedIntegers(seeds); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - typedef typename GparityWilsonFermionR::FermionField FermionField; + typedef typename GparityWilsonFermionD::FermionField FermionField; FermionField src (&Grid); random(pRNG,src); FermionField phi (&Grid); random(pRNG,phi); @@ -80,10 +80,10 @@ int main (int argc, char ** argv) RealD mass=0.1; - GparityWilsonFermionR::ImplParams params; + GparityWilsonFermionD::ImplParams params; std::vector twists(Nd,0); twists[1] = 1; params.twists = twists; - GparityWilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); + GparityWilsonFermionD Dw(Umu,Grid,RBGrid,mass,params); FermionField src_e (&RBGrid); FermionField src_o (&RBGrid); @@ -199,7 +199,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Dw); + 
SchurDiagMooeeOperator HermOpEO(Dw); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc index 68ffe624..65d55896 100644 --- a/tests/core/Test_mobius_eofa_even_odd.cc +++ b/tests/core/Test_mobius_eofa_even_odd.cc @@ -92,7 +92,7 @@ int main (int argc, char ** argv) RealD shift = 0.1234; RealD M5 = 1.8; int pm = 1; - MobiusEOFAFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5, b, c); + MobiusEOFAFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mq1, mq2, mq3, shift, pm, M5, b, c); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -218,7 +218,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd , phi_o, phi); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); HermOpEO.MpcDagMpc(chi_e, dchi_e); HermOpEO.MpcDagMpc(chi_o, dchi_o); diff --git a/tests/core/Test_mobius_even_odd.cc b/tests/core/Test_mobius_even_odd.cc index e210f236..91125ac6 100644 --- a/tests/core/Test_mobius_even_odd.cc +++ b/tests/core/Test_mobius_even_odd.cc @@ -108,8 +108,8 @@ int main (int argc, char ** argv) omegas.push_back( std::complex(0.0686324988446592,-0.0550658530827402) ); #endif - MobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, 0.5,0.5); - // DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + MobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, 0.5,0.5); + // DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -264,7 +264,7 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_staggered.cc 
b/tests/core/Test_staggered.cc index ba615ad2..f38815ae 100644 --- a/tests/core/Test_staggered.cc +++ b/tests/core/Test_staggered.cc @@ -53,9 +53,9 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(seeds); // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; FermionField src (&Grid); random(pRNG,src); FermionField result(&Grid); result=Zero(); @@ -130,7 +130,7 @@ int main (int argc, char ** argv) // ref = ref + mass * src; } - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); std::cout< HermOpEO(Ds); + SchurDiagMooeeOperator HermOpEO(Ds); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index b1b3be1d..32ad0d17 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -60,9 +60,9 @@ int main (int argc, char ** argv) pRNG4.SeedFixedIntegers(seeds); pRNG5.SeedFixedIntegers(seeds); - typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; - typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; - typename ImprovedStaggeredFermion5DR::ImplParams params; + typedef typename ImprovedStaggeredFermion5DD::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DD::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DD::ImplParams params; FermionField src (FGrid); @@ -148,7 +148,7 @@ int main (int argc, char ** argv) } } - ImprovedStaggeredFermion5DR 
Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params); + ImprovedStaggeredFermion5DD Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params); std::cout< HermOpEO(Ds); + SchurDiagMooeeOperator HermOpEO(Ds); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index d8ca9d5f..9d32ad46 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -52,9 +52,9 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(seeds); // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - typedef typename NaiveStaggeredFermionR::FermionField FermionField; - typedef typename NaiveStaggeredFermionR::ComplexField ComplexField; - typename NaiveStaggeredFermionR::ImplParams params; + typedef typename NaiveStaggeredFermionD::FermionField FermionField; + typedef typename NaiveStaggeredFermionD::ComplexField ComplexField; + typename NaiveStaggeredFermionD::ImplParams params; FermionField src (&Grid); random(pRNG,src); FermionField result(&Grid); result=Zero(); @@ -120,7 +120,7 @@ int main (int argc, char ** argv) // ref = ref + mass * src; } - NaiveStaggeredFermionR Ds(Umu,Grid,RBGrid,mass,c1,u0,params); + NaiveStaggeredFermionD Ds(Umu,Grid,RBGrid,mass,c1,u0,params); std::cout< HermOpEO(Ds); + SchurDiagMooeeOperator HermOpEO(Ds); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_wilson_clover.cc b/tests/core/Test_wilson_clover.cc index 8f143070..0ce0513f 100644 --- a/tests/core/Test_wilson_clover.cc +++ b/tests/core/Test_wilson_clover.cc @@ -52,8 +52,8 @@ int main(int argc, char **argv) pRNG.SeedFixedIntegers(seeds); // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename 
WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); @@ -88,8 +88,8 @@ int main(int argc, char **argv) RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params); - CompactWilsonCloverFermionR Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); + WilsonCloverFermionD Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params); + CompactWilsonCloverFermionD Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); std::cout << GridLogMessage << "==========================================================" << std::endl; std::cout << GridLogMessage << "= Testing that Deo + Doe = Dunprec " << std::endl; @@ -324,8 +324,8 @@ int main(int argc, char **argv) } ///////////////// - WilsonCloverFermionR Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params); - CompactWilsonCloverFermionR Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); + WilsonCloverFermionD Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params); + CompactWilsonCloverFermionD Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); tmp = Omega * src; pickCheckerboard(Even, src_e, tmp); @@ -377,14 +377,14 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); - WilsonFermionR Dw(Umu, Grid, RBGrid, mass, params); + WilsonFermionD Dw(Umu, Grid, RBGrid, mass, params); Dw.M(src, result); Dwc.M(src, chi); Dwc_prime.M(Omega * src, phi); - WilsonFermionR Dw_prime(U_prime, Grid, RBGrid, mass, params); + WilsonFermionD Dw_prime(U_prime, Grid, RBGrid, mass, params); Dw_prime.M(Omega * src, result2); err = result - adj(Omega) * result2; @@ -411,7 +411,7 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); err = Zero(); - WilsonCloverFermionR Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); // <-- Notice: csw=0 + WilsonCloverFermionD Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, 
params); // <-- Notice: csw=0 pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd, phi_o, phi); @@ -437,7 +437,7 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); err = Zero(); - CompactWilsonCloverFermionR Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); // <-- Notice: csw=0 + CompactWilsonCloverFermionD Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); // <-- Notice: csw=0 pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd, phi_o, phi); diff --git a/tests/core/Test_wilson_conserved_current.cc b/tests/core/Test_wilson_conserved_current.cc index 3ee1a271..c66bf940 100644 --- a/tests/core/Test_wilson_conserved_current.cc +++ b/tests/core/Test_wilson_conserved_current.cc @@ -74,7 +74,7 @@ int main (int argc, char ** argv) SU::HotConfiguration(RNG4,Umu); } - typename WilsonCloverFermionR::ImplParams params; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; RealD mass = 0.1; RealD csw_r = 1.0; @@ -83,32 +83,32 @@ int main (int argc, char ** argv) std::cout<(Dw,Umu,UGrid,UrbGrid,&RNG4); + WilsonFermionD Dw(Umu,*UGrid,*UrbGrid,mass,params); + TestConserved(Dw,Umu,UGrid,UrbGrid,&RNG4); std::cout<(Dwc,Umu,UGrid,UrbGrid,&RNG4); + WilsonCloverFermionD Dwc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, anis, params); + TestConserved(Dwc,Umu,UGrid,UrbGrid,&RNG4); std::cout<(Dwcc,Umu,UGrid,UrbGrid,&RNG4); + CompactWilsonCloverFermionD Dwcc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, 1.0, anis, params); + TestConserved(Dwcc,Umu,UGrid,UrbGrid,&RNG4); std::cout<(Dewc,Umu,UGrid,UrbGrid,&RNG4); + WilsonExpCloverFermionD Dewc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, anis, params); + TestConserved(Dewc,Umu,UGrid,UrbGrid,&RNG4); std::cout<(Dewcc,Umu,UGrid,UrbGrid,&RNG4); + CompactWilsonExpCloverFermionD Dewcc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, 1.0, anis, params); + TestConserved(Dewcc,Umu,UGrid,UrbGrid,&RNG4); Grid_finalize(); } diff --git a/tests/core/Test_wilson_even_odd.cc 
b/tests/core/Test_wilson_even_odd.cc index 4d240b80..81081bd0 100644 --- a/tests/core/Test_wilson_even_odd.cc +++ b/tests/core/Test_wilson_even_odd.cc @@ -89,7 +89,7 @@ int main (int argc, char ** argv) RealD mass=0.1; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); LatticeFermion src_e (&RBGrid); LatticeFermion src_o (&RBGrid); @@ -205,7 +205,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Dw); + SchurDiagMooeeOperator HermOpEO(Dw); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_wilson_exp_clover.cc b/tests/core/Test_wilson_exp_clover.cc index 8516d0dc..017d8823 100644 --- a/tests/core/Test_wilson_exp_clover.cc +++ b/tests/core/Test_wilson_exp_clover.cc @@ -52,8 +52,8 @@ int main(int argc, char **argv) pRNG.SeedFixedIntegers(seeds); // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); - typedef typename WilsonExpCloverFermionR::FermionField FermionField; - typename WilsonExpCloverFermionR::ImplParams params; + typedef typename WilsonExpCloverFermionD::FermionField FermionField; + typename WilsonExpCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); @@ -88,8 +88,8 @@ int main(int argc, char **argv) RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonExpCloverFermionR Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params); - CompactWilsonExpCloverFermionR Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); + WilsonExpCloverFermionD Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params); + CompactWilsonExpCloverFermionD Dwc_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); std::cout << GridLogMessage << "==========================================================" << std::endl; std::cout << GridLogMessage << "= Testing that Deo + Doe = Dunprec " << std::endl; @@ -324,8 +324,8 @@ int main(int argc, char **argv) } 
///////////////// - WilsonExpCloverFermionR Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params); - CompactWilsonExpCloverFermionR Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); + WilsonExpCloverFermionD Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params); + CompactWilsonExpCloverFermionD Dwc_compact_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, 1.0, anis, params); tmp = Omega * src; pickCheckerboard(Even, src_e, tmp); @@ -377,14 +377,14 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); - WilsonFermionR Dw(Umu, Grid, RBGrid, mass, params); + WilsonFermionD Dw(Umu, Grid, RBGrid, mass, params); Dw.M(src, result); Dwc.M(src, chi); Dwc_prime.M(Omega * src, phi); - WilsonFermionR Dw_prime(U_prime, Grid, RBGrid, mass, params); + WilsonFermionD Dw_prime(U_prime, Grid, RBGrid, mass, params); Dw_prime.M(Omega * src, result2); err = result - adj(Omega) * result2; @@ -411,7 +411,7 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); err = Zero(); - WilsonExpCloverFermionR Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); // <-- Notice: csw=0 + WilsonExpCloverFermionD Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); // <-- Notice: csw=0 pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd, phi_o, phi); @@ -437,7 +437,7 @@ int main(int argc, char **argv) chi = Zero(); phi = Zero(); err = Zero(); - CompactWilsonExpCloverFermionR Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); // <-- Notice: csw=0 + CompactWilsonExpCloverFermionD Dwc_compact_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, 1.0, anis, params); // <-- Notice: csw=0 pickCheckerboard(Even, phi_e, phi); pickCheckerboard(Odd, phi_o, phi); diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc index d9e798c3..0351f7cd 100644 --- a/tests/core/Test_wilson_twisted_mass_even_odd.cc +++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc @@ -90,7 
+90,7 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD mu = 0.1; - WilsonTMFermionR Dw(Umu,Grid,RBGrid,mass,mu); + WilsonTMFermionD Dw(Umu,Grid,RBGrid,mass,mu); LatticeFermion src_e (&RBGrid); LatticeFermion src_o (&RBGrid); @@ -206,7 +206,7 @@ int main (int argc, char ** argv) pickCheckerboard(Even,phi_e,phi); pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Dw); + SchurDiagMooeeOperator HermOpEO(Dw); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/core/Test_zmobius_even_odd.cc b/tests/core/Test_zmobius_even_odd.cc index f6e18934..ee6fe860 100644 --- a/tests/core/Test_zmobius_even_odd.cc +++ b/tests/core/Test_zmobius_even_odd.cc @@ -123,7 +123,7 @@ int main (int argc, char ** argv) RealD _mass,RealD _M5, std::vector &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : */ - ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,RealD(1.),RealD(0.)); + ZMobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,RealD(1.),RealD(0.)); LatticeFermion src_e (FrbGrid); LatticeFermion src_o (FrbGrid); @@ -278,7 +278,7 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); HermOpEO.MpcDagMpc(chi_e,dchi_e); HermOpEO.MpcDagMpc(chi_o,dchi_o); diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index dc9eedce..526cde12 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -66,7 +66,7 @@ int main (int argc, char ** argv) //////////////////////////////////// RealD mass=0.01; RealD M5=1.8; - OverlapWilsonContFracTanhFermionR Dcf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + OverlapWilsonContFracTanhFermionD Dcf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); Dcf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_dwf_force.cc 
b/tests/forces/Test_dwf_force.cc index e7d17347..1ae28bb2 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -67,7 +67,7 @@ int main (int argc, char ** argv) //////////////////////////////////// RealD mass=0.01; RealD M5=1.8; - DomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); Ddwf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 525178d0..d820573b 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -80,8 +80,8 @@ int main (int argc, char** argv) RealD mf = 0.01; RealD mb = 1.0; RealD M5 = 1.8; - DomainWallEOFAFermionR Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5); - DomainWallEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5); + DomainWallEOFAFermionD Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5); + DomainWallEOFAFermionD Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5); OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12); ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true); diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index 9db2c563..72d30369 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - typedef typename GparityDomainWallFermionR::FermionField FermionField; + typedef typename GparityDomainWallFermionD::FermionField FermionField; int threads = GridThread::GetThreads(); std::cout< twists(Nd,0); twists[nu] = 1; twists[Nd-1] = 1; //antiperiodic in 
time - GparityDomainWallFermionR::ImplParams params; + GparityDomainWallFermionD::ImplParams params; params.twists = twists; - GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); + GparityDomainWallFermionD Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Dw.M (phi,Mphi); diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index fd47d33c..08923faa 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -33,7 +33,7 @@ using namespace std; using namespace Grid; typedef GparityWilsonImplR FermionImplPolicy; -typedef GparityDomainWallEOFAFermionR FermionAction; +typedef GparityDomainWallEOFAFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; int main (int argc, char** argv) diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index af1ce82b..58dbfc47 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -56,7 +56,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); std::cout< twists(Nd,0); twists[nu] = 1; twists[3] = 1; - GparityDomainWallFermionR::ImplParams params; params.twists = twists; - GparityDomainWallFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); + GparityDomainWallFermionD::ImplParams params; params.twists = twists; + GparityDomainWallFermionD Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Ddwf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_gpdwf_force_1f_2f.cc b/tests/forces/Test_gpdwf_force_1f_2f.cc index 7e14eb08..c343b7ac 100644 --- a/tests/forces/Test_gpdwf_force_1f_2f.cc +++ b/tests/forces/Test_gpdwf_force_1f_2f.cc @@ -70,8 +70,8 @@ void convertFermion1f_from_2f(FermionField1f &out_1f, const FermionField2f &in_2 int nuoff = is_4d ? 
0 : 1; //s in 0 direction - int L_2f = FGrid_2f->FullDimensions()[nu+nuoff]; - int L_1f = FGrid_1f->FullDimensions()[nu+nuoff]; + Integer L_2f = FGrid_2f->FullDimensions()[nu+nuoff]; + Integer L_1f = FGrid_1f->FullDimensions()[nu+nuoff]; assert(L_1f == 2 * L_2f); auto in_f0_2fgrid = PeekIndex(in_2f,0); //flavor 0 on 2f Grid diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index 7ab2ddeb..4c3380fe 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -50,7 +50,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); std::cout< twists(Nd,0); twists[nu] = 1; twists[3]=1; - GparityWilsonFermionR::ImplParams params; params.twists = twists; - GparityWilsonFermionR Wil(U,*UGrid,*UrbGrid,mass,params); + GparityWilsonFermionD::ImplParams params; params.twists = twists; + GparityWilsonFermionD Wil(U,*UGrid,*UrbGrid,mass,params); Wil.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index d2326a81..3518007c 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -76,7 +76,7 @@ int main (int argc, char ** argv) p.boundary_phases[2] = 1.0; p.boundary_phases[3] =- 1.0; - MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,p); + MobiusFermionD Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,p); Ddwf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index 1d25771a..a8871faa 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -82,8 +82,8 @@ int main (int argc, char** argv) RealD mf = 0.01; RealD mb = 1.0; RealD M5 = 1.8; - MobiusEOFAFermionR Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, 
mb, mf, mb, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c); OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12); ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 7f114615..dd71b565 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -34,7 +34,7 @@ using namespace Grid; ; typedef GparityWilsonImplR FermionImplPolicy; -typedef GparityMobiusEOFAFermionR FermionAction; +typedef GparityMobiusEOFAFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; int main (int argc, char** argv) diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 33f7b5fd..173f7626 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) //////////////////////////////////// RealD mass=0.01; RealD M5=1.8; - OverlapWilsonPartialFractionTanhFermionR Dpf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + OverlapWilsonPartialFractionTanhFermionD Dpf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); Dpf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index b7bf1268..f4bf8ed3 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -67,7 +67,7 @@ int main (int argc, char ** argv) // Unmodified matrix element //////////////////////////////////// RealD mass=-4.0; //kills the diagonal term - WilsonFermionR Dw (U, Grid,RBGrid,mass); + WilsonFermionD Dw (U, Grid,RBGrid,mass); Dw.M (phi,Mphi); ComplexD S = 
innerProduct(Mphi,Mphi); // pdag MdagM p diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index 6a28e4e2..8aa5eb9d 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -70,7 +70,7 @@ int main(int argc, char **argv) //////////////////////////////////// RealD mass = 0.1; Real csw = 1.0; - WilsonCloverFermionR Dw(U, Grid, RBGrid, mass, csw, csw); + WilsonCloverFermionD Dw(U, Grid, RBGrid, mass, csw, csw); Dw.ImportGauge(U); Dw.M(phi, Mphi); ComplexD S = innerProduct(Mphi, Mphi); // Action : pdag MdagM p diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index 89673bc7..5d3a86f4 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -81,7 +81,7 @@ int main (int argc, char ** argv) omegas.push_back( std::complex(0.0686324988446592,0.0550658530827402) ); omegas.push_back( std::complex(0.0686324988446592,-0.0550658530827402) ); - ZMobiusFermionR Ddwf(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,b,c); + ZMobiusFermionD Ddwf(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,b,c); Ddwf.M (phi,Mphi); diff --git a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc index 830bcead..46f87d93 100644 --- a/tests/hmc/Test_action_dwf_gparity2fvs1f.cc +++ b/tests/hmc/Test_action_dwf_gparity2fvs1f.cc @@ -59,7 +59,7 @@ void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int g LatticeInteger xcoor_1f(out.Grid()); //5d lattice integer LatticeCoordinate(xcoor_1f,gpdir); - int L = dim_2f[gpdir]; + Integer L = dim_2f[gpdir]; out = where(xcoor_1f < L, f0_fullgrid_dbl, f1_fullgrid_dbl); } @@ -76,7 +76,7 @@ void copy2fTo1fGaugeField(LatticeGaugeField &out, const LatticeGaugeField &in, i LatticeInteger xcoor_1f(out.Grid()); LatticeCoordinate(xcoor_1f,gpdir); - int L = dim_2f[gpdir]; + Integer L = dim_2f[gpdir]; out = where(xcoor_1f < L, U_dbl, Uconj_dbl); } 
@@ -140,11 +140,11 @@ int main(int argc, char **argv) { copy2fTo1fGaugeField(Umu_1f, Umu_2f, mu); typedef GparityWilsonImplR FermionImplPolicy2f; - typedef GparityDomainWallFermionR FermionAction2f; + typedef GparityDomainWallFermionD FermionAction2f; typedef typename FermionAction2f::FermionField FermionField2f; typedef WilsonImplR FermionImplPolicy1f; - typedef DomainWallFermionR FermionAction1f; + typedef DomainWallFermionD FermionAction1f; typedef typename FermionAction1f::FermionField FermionField1f; std::cout << "Generating eta 2f" << std::endl; diff --git a/tests/hmc/Test_hmc_EODWFRatio.cc b/tests/hmc/Test_hmc_EODWFRatio.cc index 93469ffe..ff8521cb 100644 --- a/tests/hmc/Test_hmc_EODWFRatio.cc +++ b/tests/hmc/Test_hmc_EODWFRatio.cc @@ -43,7 +43,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef DomainWallFermionR FermionAction; + typedef DomainWallFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; @@ -136,16 +136,9 @@ int main(int argc, char **argv) { TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file // Reset performance counters - NumOp.ZeroCounters(); - DenOp.ZeroCounters(); TheHMC.Run(); // no smearing // TheHMC.Run(SmearingPolicy); // for smearing - std::cout << GridLogMessage << "Numerator report, Pauli-Villars term : " << std::endl; - NumOp.Report(); - std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl; - DenOp.Report(); - Grid_finalize(); } // main diff --git a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc index 9ca0b0a0..f98d0edc 100644 --- a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc +++ b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { typedef ConjugateHMCRunner HMCWrapper; // Uses the default minimum norm typedef GparityWilsonImplR 
FermionImplPolicy; - typedef GparityDomainWallFermionR FermionAction; + typedef GparityDomainWallFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; @@ -132,15 +132,9 @@ int main(int argc, char **argv) { TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file // Reset performance counters - NumOp.ZeroCounters(); - DenOp.ZeroCounters(); TheHMC.Run(); // no smearing // TheHMC.Run(SmearingPolicy); // for smearing - std::cout << GridLogMessage << "Numerator report, Pauli-Villars term : " << std::endl; - NumOp.Report(); - std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl; - DenOp.Report(); Grid_finalize(); diff --git a/tests/hmc/Test_hmc_EOMobiusRatio.cc b/tests/hmc/Test_hmc_EOMobiusRatio.cc index 63b4d4fa..0e0a6611 100644 --- a/tests/hmc/Test_hmc_EOMobiusRatio.cc +++ b/tests/hmc/Test_hmc_EOMobiusRatio.cc @@ -83,7 +83,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; // Serialiser typedef Grid::XmlReader Serialiser; @@ -211,8 +211,6 @@ int main(int argc, char **argv) { */ // Reset performance counters - NumOp.ZeroCounters(); - DenOp.ZeroCounters(); if (ApplySmearing){ SmearingParameters SmPar(Reader); @@ -225,11 +223,6 @@ int main(int argc, char **argv) { TheHMC.Run(); // no smearing } - std::cout << GridLogMessage << "Numerator report, Pauli-Villars term : " << std::endl; - NumOp.Report(); - std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl; - DenOp.Report(); - Grid_finalize(); } // main diff --git a/tests/hmc/Test_hmc_EOMobiusRatioManyFlavour.cc b/tests/hmc/Test_hmc_EOMobiusRatioManyFlavour.cc index 790433f2..3f29ae62 100644 --- 
a/tests/hmc/Test_hmc_EOMobiusRatioManyFlavour.cc +++ b/tests/hmc/Test_hmc_EOMobiusRatioManyFlavour.cc @@ -89,7 +89,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; // Serialiser typedef Grid::XmlReader Serialiser; @@ -226,8 +226,6 @@ int main(int argc, char **argv) { */ // Reset performance counters - NumOp.ZeroCounters(); - DenOp.ZeroCounters(); if (ApplySmearing){ SmearingParameters SmPar(Reader); @@ -240,10 +238,6 @@ int main(int argc, char **argv) { TheHMC.Run(); // no smearing } - std::cout << GridLogMessage << "Numerator report, Pauli-Villars term : " << std::endl; - NumOp.Report(); - std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl; - DenOp.Report(); Grid_finalize(); } // main diff --git a/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc b/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc index 6b9b70b5..f6485f20 100644 --- a/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc +++ b/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonCloverFermionR FermionAction; + typedef WilsonCloverFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_EOWilsonFermionGauge.cc b/tests/hmc/Test_hmc_EOWilsonFermionGauge.cc index 3643c0ad..092e66d1 100644 --- a/tests/hmc/Test_hmc_EOWilsonFermionGauge.cc +++ b/tests/hmc/Test_hmc_EOWilsonFermionGauge.cc @@ -40,7 +40,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm 
typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_EOWilsonRatio.cc b/tests/hmc/Test_hmc_EOWilsonRatio.cc index 675bc605..406aa34d 100644 --- a/tests/hmc/Test_hmc_EOWilsonRatio.cc +++ b/tests/hmc/Test_hmc_EOWilsonRatio.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc index d4bfa0a5..d79404a0 100644 --- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc +++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { typedef ConjugateHMCRunner HMCWrapper; // Uses the default minimum norm typedef GparityWilsonImplR FermionImplPolicy; - typedef GparityDomainWallFermionR FermionAction; + typedef GparityDomainWallFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_GparityWilsonGauge.cc b/tests/hmc/Test_hmc_GparityWilsonGauge.cc index b8c078fe..76901235 100644 --- a/tests/hmc/Test_hmc_GparityWilsonGauge.cc +++ b/tests/hmc/Test_hmc_GparityWilsonGauge.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { typedef ConjugateHMCRunner HMCWrapper; // Uses the default minimum norm typedef GparityWilsonImplR FermionImplPolicy; - typedef GparityDomainWallFermionR FermionAction; + typedef GparityDomainWallFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_Mobius2p1f.cc b/tests/hmc/Test_hmc_Mobius2p1f.cc index 508f5b5e..8c97fbb5 100644 --- a/tests/hmc/Test_hmc_Mobius2p1f.cc +++ 
b/tests/hmc/Test_hmc_Mobius2p1f.cc @@ -39,7 +39,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef WilsonImplR FermionImplPolicy; - typedef MobiusFermionR FermionAction; + typedef MobiusFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; @@ -148,14 +148,14 @@ int main(int argc, char **argv) { // Level1.push_back(&StrangePseudoFermion); // DJM: setup for EOFA ratio (Shamir) - // DomainWallEOFAFermionR Strange_Op_L(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, strange_mass, strange_mass, pv_mass, 0.0, -1, M5); - // DomainWallEOFAFermionR Strange_Op_R(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, pv_mass, strange_mass, pv_mass, -1.0, 1, M5); + // DomainWallEOFAFermionD Strange_Op_L(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, strange_mass, strange_mass, pv_mass, 0.0, -1, M5); + // DomainWallEOFAFermionD Strange_Op_R(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, pv_mass, strange_mass, pv_mass, -1.0, 1, M5); // ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L, Strange_Op_R, CG, OFRp, true); // Level1.push_back(&EOFA); // DJM: setup for EOFA ratio (Mobius) - MobiusEOFAFermionR Strange_Op_L(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Strange_Op_R(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Strange_Op_L(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R(U, *FGrid, *FrbGrid, *GridPtr, *GridRBPtr, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L, Strange_Op_R, CG, OFRp, true); Level1.push_back(&EOFA); diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc index 726ecd4a..119a39dc 100644 --- a/tests/hmc/Test_hmc_ScalarActionNxN.cc +++ 
b/tests/hmc/Test_hmc_ScalarActionNxN.cc @@ -34,7 +34,7 @@ class ScalarActionParameters : Serializable { double, lambda, double, g); - ScalarActionParameters() = default; + ScalarActionParameters() {}; template ScalarActionParameters(Reader& Reader){ @@ -45,7 +45,6 @@ class ScalarActionParameters : Serializable { } using namespace Grid; - ; template class MagMeas : public HmcObservable { diff --git a/tests/hmc/Test_hmc_WC2ASFG_Production.cc b/tests/hmc/Test_hmc_WC2ASFG_Production.cc index 0bbf4ece..55c84c0a 100644 --- a/tests/hmc/Test_hmc_WC2ASFG_Production.cc +++ b/tests/hmc/Test_hmc_WC2ASFG_Production.cc @@ -80,7 +80,7 @@ int main(int argc, char **argv) // Typedefs to simplify notation typedef GenericHMCRunnerHirep HMCWrapper; // Uses the default minimum norm typedef WilsonTwoIndexAntiSymmetricImplR FermionImplPolicy; // gauge field implemetation for the pseudofermions - typedef WilsonCloverTwoIndexAntiSymmetricFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...) + typedef WilsonCloverTwoIndexAntiSymmetricFermionD FermionAction; // type of lattice fermions (Wilson, DW, ...) typedef typename FermionAction::FermionField FermionField; //typedef Grid::JSONReader Serialiser; typedef Grid::XmlReader Serialiser; diff --git a/tests/hmc/Test_hmc_WC2SFG_Production.cc b/tests/hmc/Test_hmc_WC2SFG_Production.cc index 64a3f1cb..80f7baaf 100644 --- a/tests/hmc/Test_hmc_WC2SFG_Production.cc +++ b/tests/hmc/Test_hmc_WC2SFG_Production.cc @@ -81,7 +81,7 @@ int main(int argc, char **argv) // Typedefs to simplify notation typedef GenericHMCRunnerHirep HMCWrapper; // Uses the default minimum norm typedef WilsonTwoIndexSymmetricImplR FermionImplPolicy; // gauge field implemetation for the pseudofermions - typedef WilsonCloverTwoIndexSymmetricFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...) + typedef WilsonCloverTwoIndexSymmetricFermionD FermionAction; // type of lattice fermions (Wilson, DW, ...) 
typedef typename FermionAction::FermionField FermionField; //typedef Grid::JSONReader Serialiser; typedef Grid::XmlReader Serialiser; diff --git a/tests/hmc/Test_hmc_WCFG_Production.cc b/tests/hmc/Test_hmc_WCFG_Production.cc index cebe3791..bd02886d 100644 --- a/tests/hmc/Test_hmc_WCFG_Production.cc +++ b/tests/hmc/Test_hmc_WCFG_Production.cc @@ -79,7 +79,7 @@ int main(int argc, char **argv) // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonCloverFermionR FermionAction; + typedef WilsonCloverFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; diff --git a/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc index 211900be..ac017860 100644 --- a/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc +++ b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc @@ -84,11 +84,11 @@ int main(int argc, char **argv) { typedef GenericHMCRunnerHirep HMCWrapper; typedef WilsonImplR FundImplPolicy; - typedef WilsonCloverFermionR FundFermionAction; + typedef WilsonCloverFermionD FundFermionAction; typedef typename FundFermionAction::FermionField FundFermionField; typedef WilsonTwoIndexAntiSymmetricImplR ASymmImplPolicy; - typedef WilsonCloverTwoIndexAntiSymmetricFermionR ASymmFermionAction; + typedef WilsonCloverTwoIndexAntiSymmetricFermionD ASymmFermionAction; typedef typename ASymmFermionAction::FermionField ASymmFermionField; typedef Grid::XmlReader Serialiser; diff --git a/tests/hmc/Test_hmc_WCadjFG_Production.cc b/tests/hmc/Test_hmc_WCadjFG_Production.cc index 5cf4bac1..a9673e6c 100644 --- a/tests/hmc/Test_hmc_WCadjFG_Production.cc +++ b/tests/hmc/Test_hmc_WCadjFG_Production.cc @@ -81,7 +81,7 @@ int main(int argc, char **argv) // Typedefs to simplify notation typedef GenericHMCRunnerHirep HMCWrapper; // Uses the default minimum norm typedef WilsonAdjImplR FermionImplPolicy; // 
gauge field implemetation for the pseudofermions - typedef WilsonCloverAdjFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...) + typedef WilsonCloverAdjFermionD FermionAction; // type of lattice fermions (Wilson, DW, ...) typedef typename FermionAction::FermionField FermionField; typedef Grid::XmlReader Serialiser; diff --git a/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc b/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc index cc56cae3..774c9037 100644 --- a/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc @@ -46,7 +46,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunnerHirep HMCWrapper; typedef WilsonAdjImplR FermionImplPolicy; // gauge field implemetation for the pseudofermions - typedef WilsonAdjFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...) + typedef WilsonAdjFermionD FermionAction; // type of lattice fermions (Wilson, DW, ...) typedef typename FermionAction::FermionField FermionField; //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc b/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc index 9e27e3ec..149e6c5c 100644 --- a/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonCloverFermionR FermionAction; + typedef WilsonCloverFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge.cc b/tests/hmc/Test_hmc_WilsonFermionGauge.cc index cc1f2474..a0c43c51 100644 --- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc 
@@ -42,7 +42,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_WilsonGauge.cc b/tests/hmc/Test_hmc_WilsonGauge.cc index 4ef0e658..28feadf3 100644 --- a/tests/hmc/Test_hmc_WilsonGauge.cc +++ b/tests/hmc/Test_hmc_WilsonGauge.cc @@ -69,8 +69,10 @@ int main(int argc, char **argv) TopologyObsParameters TopParams; TopParams.interval = 5; TopParams.do_smearing = true; - TopParams.Smearing.steps = 200; - TopParams.Smearing.step_size = 0.01; + TopParams.Smearing.init_step_size = 0.01; + TopParams.Smearing.tolerance = 1e-5; + // TopParams.Smearing.steps = 200; + // TopParams.Smearing.step_size = 0.01; TopParams.Smearing.meas_interval = 50; TopParams.Smearing.maxTau = 2.0; TheHMC.Resources.AddObservable(TopParams); diff --git a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc index 3b8cdda6..60d760a6 100644 --- a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc @@ -51,9 +51,9 @@ int main(int argc, char **argv) { typedef GenericHMCRunnerHirep HMCWrapper; typedef WilsonAdjImplR AdjImplPolicy; // gauge field implemetation for the pseudofermions - typedef WilsonAdjFermionR AdjFermionAction; // type of lattice fermions (Wilson, DW, ...) + typedef WilsonAdjFermionD AdjFermionAction; // type of lattice fermions (Wilson, DW, ...) 
typedef WilsonTwoIndexSymmetricImplR SymmImplPolicy; - typedef WilsonTwoIndexSymmetricFermionR SymmFermionAction; + typedef WilsonTwoIndexSymmetricFermionD SymmFermionAction; typedef typename AdjFermionAction::FermionField AdjFermionField; diff --git a/tests/hmc/Test_hmc_WilsonRatio.cc b/tests/hmc/Test_hmc_WilsonRatio.cc index 3e3cac7e..e134dd83 100644 --- a/tests/hmc/Test_hmc_WilsonRatio.cc +++ b/tests/hmc/Test_hmc_WilsonRatio.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_WilsonTMFermionGauge.cc b/tests/hmc/Test_hmc_WilsonTMFermionGauge.cc index 5928efbe..9916580e 100644 --- a/tests/hmc/Test_hmc_WilsonTMFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonTMFermionGauge.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonTMFermionR FermionAction; + typedef WilsonTMFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc b/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc index 387842f7..bdcd27c5 100644 --- a/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc @@ -45,7 +45,7 @@ int main(int argc, char **argv) { typedef GenericHMCRunnerHirep HMCWrapper; typedef WilsonTwoIndexSymmetricImplR FermionImplPolicy; // gauge field implemetation for the pseudofermions - typedef WilsonTwoIndexSymmetricFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...) 
+ typedef WilsonTwoIndexSymmetricFermionD FermionAction; // type of lattice fermions (Wilson, DW, ...) typedef typename FermionAction::FermionField FermionField; //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/tests/hmc/Test_rhmc_EOWilson1p1.cc b/tests/hmc/Test_rhmc_EOWilson1p1.cc index 51a966b1..1e0975ca 100644 --- a/tests/hmc/Test_rhmc_EOWilson1p1.cc +++ b/tests/hmc/Test_rhmc_EOWilson1p1.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_rhmc_EOWilsonRatio.cc b/tests/hmc/Test_rhmc_EOWilsonRatio.cc index 44fb6d47..06d54215 100644 --- a/tests/hmc/Test_rhmc_EOWilsonRatio.cc +++ b/tests/hmc/Test_rhmc_EOWilsonRatio.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_rhmc_Wilson1p1.cc b/tests/hmc/Test_rhmc_Wilson1p1.cc index 93b748d2..2935092c 100644 --- a/tests/hmc/Test_rhmc_Wilson1p1.cc +++ b/tests/hmc/Test_rhmc_Wilson1p1.cc @@ -42,7 +42,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; diff --git a/tests/hmc/Test_rhmc_WilsonRatio.cc b/tests/hmc/Test_rhmc_WilsonRatio.cc index 4896d329..60bb7641 100644 --- a/tests/hmc/Test_rhmc_WilsonRatio.cc +++ 
b/tests/hmc/Test_rhmc_WilsonRatio.cc @@ -41,7 +41,7 @@ int main(int argc, char **argv) { // Typedefs to simplify notation typedef GenericHMCRunner HMCWrapper; // Uses the default minimum norm typedef WilsonImplR FermionImplPolicy; - typedef WilsonFermionR FermionAction; + typedef WilsonFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; From 3dbfce522320128e26fcc3fc7223e33fe221d964 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Nov 2022 20:15:51 -0500 Subject: [PATCH 189/240] Tests clean build on HIP --- Grid/algorithms/CoarsenedMatrix.h | 22 ++--- Grid/algorithms/LinearOperator.h | 1 + Grid/qcd/action/ActionParams.h | 9 -- Grid/stencil/Stencil.h | 10 ++ Grid/tensors/Tensor_traits.h | 2 +- systems/Crusher/config-command | 4 +- systems/Crusher/sourceme.sh | 2 + tests/debug/Test_cayley_cg.cc | 48 ++++----- tests/debug/Test_cayley_coarsen_support.cc | 6 +- tests/debug/Test_cayley_even_odd.cc | 32 +++--- tests/debug/Test_cayley_ldop_cr.cc | 6 +- tests/debug/Test_cayley_mres.cc | 28 +++--- tests/debug/Test_heatbath_dwf_eofa.cc | 4 +- tests/debug/Test_heatbath_dwf_eofa_gparity.cc | 4 +- tests/debug/Test_heatbath_mobius_eofa.cc | 4 +- .../Test_heatbath_mobius_eofa_gparity.cc | 2 +- tests/debug/Test_reweight_dwf_eofa.cc | 16 +-- tests/debug/Test_reweight_dwf_eofa_gparity.cc | 20 ++-- tests/debug/Test_reweight_mobius_eofa.cc | 16 +-- .../Test_reweight_mobius_eofa_gparity.cc | 20 ++-- tests/hmc/Test_hmc_WC2ASFG_Production.cc | 6 +- tests/hmc/Test_hmc_WC2SFG_Production.cc | 4 + tests/hmc/Test_hmc_WCMixedRepFG_Production.cc | 4 + tests/hmc/Test_hmc_WCadjFG_Production.cc | 4 + .../hmc/Test_hmc_WilsonAdjointFermionGauge.cc | 6 +- ..._WilsonMixedRepresentationsFermionGauge.cc | 4 + ...hmc_WilsonTwoIndexSymmetricFermionGauge.cc | 4 + tests/lanczos/Test_WCMultiRep_lanczos.cc | 13 ++- tests/lanczos/Test_compressed_lanczos.cc | 4 +- .../Test_dwf_compressed_lanczos_reorg.cc | 4 +- ..._dwf_compressed_lanczos_reorg_synthetic.cc | 4 +- 
tests/lanczos/Test_dwf_lanczos.cc | 16 +-- tests/lanczos/Test_wilson_lanczos.cc | 4 +- tests/smearing/Test_WilsonFlow.cc | 9 +- tests/solver/Test_cf_cr_unprec.cc | 6 +- tests/solver/Test_coarse_even_odd.cc | 4 +- tests/solver/Test_contfrac_cg.cc | 16 +-- tests/solver/Test_dwf_cg_prec.cc | 7 +- tests/solver/Test_dwf_cg_schur.cc | 2 +- tests/solver/Test_dwf_cg_unprec.cc | 4 +- tests/solver/Test_dwf_cr_unprec.cc | 6 +- tests/solver/Test_dwf_fpgcr.cc | 6 +- tests/solver/Test_dwf_hdcr.cc | 14 +-- tests/solver/Test_dwf_hdcr_16_rb.cc | 14 +-- tests/solver/Test_dwf_hdcr_24_regression.cc | 18 ++-- tests/solver/Test_dwf_hdcr_2level.cc | 30 +++--- tests/solver/Test_dwf_hdcr_48_rb.cc | 14 +-- tests/solver/Test_dwf_hdcr_48_regression.cc | 18 ++-- tests/solver/Test_dwf_mrhs_cg.cc | 14 +-- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 14 +-- tests/solver/Test_dwf_mrhs_cg_mpieo.cc | 14 +-- tests/solver/Test_dwf_multigrid.cc | 20 ++-- tests/solver/Test_dwf_qmr_unprec.cc | 8 +- tests/solver/Test_eofa_inv.cc | 4 +- tests/solver/Test_hw_multigrid.cc | 10 +- tests/solver/Test_hw_multigrid_mixed_48.cc | 98 +++++++++---------- tests/solver/Test_hw_multigrid_mixed_48_rb.cc | 96 +++++++++--------- tests/solver/Test_mobius_bcg.cc | 14 +-- tests/solver/Test_mobius_bcg_nosplit.cc | 10 +- tests/solver/Test_mobius_bcg_phys_nosplit.cc | 10 +- tests/solver/Test_mobius_bcg_prec_nosplit.cc | 10 +- tests/solver/Test_split_grid.cc | 14 +-- tests/solver/Test_staggered_block_cg_prec.cc | 18 ++-- .../solver/Test_staggered_block_cg_unprec.cc | 26 ++--- tests/solver/Test_staggered_cagmres_unprec.cc | 10 +- tests/solver/Test_staggered_cg_prec.cc | 8 +- tests/solver/Test_staggered_cg_schur.cc | 6 +- tests/solver/Test_staggered_cg_unprec.cc | 10 +- tests/solver/Test_staggered_fcagmres_prec.cc | 10 +- tests/solver/Test_staggered_fgmres_prec.cc | 10 +- tests/solver/Test_staggered_gmres_unprec.cc | 10 +- tests/solver/Test_staggered_mr_unprec.cc | 10 +- tests/solver/Test_staggered_multishift.cc | 8 +- 
tests/solver/Test_wilson_cagmres_unprec.cc | 4 +- tests/solver/Test_wilson_cg_prec.cc | 4 +- tests/solver/Test_wilson_cg_schur.cc | 2 +- tests/solver/Test_wilson_cg_unprec.cc | 4 +- tests/solver/Test_wilson_cr_unprec.cc | 4 +- tests/solver/Test_wilson_fcagmres_prec.cc | 4 +- tests/solver/Test_wilson_fgmres_prec.cc | 4 +- tests/solver/Test_wilson_gmres_unprec.cc | 4 +- tests/solver/Test_wilson_mg.cc | 6 +- tests/solver/Test_wilson_mr_unprec.cc | 4 +- tests/solver/Test_wilson_qmr_unprec.cc | 4 +- .../solver/Test_wilsonclover_bicgstab_prec.cc | 4 +- .../Test_wilsonclover_bicgstab_schur.cc | 2 +- .../Test_wilsonclover_bicgstab_unprec.cc | 4 +- .../Test_wilsonclover_cagmres_unprec.cc | 8 +- tests/solver/Test_wilsonclover_cg_prec.cc | 16 +-- tests/solver/Test_wilsonclover_cg_schur.cc | 8 +- tests/solver/Test_wilsonclover_cg_unprec.cc | 16 +-- .../solver/Test_wilsonclover_fcagmres_prec.cc | 8 +- tests/solver/Test_wilsonclover_fgmres_prec.cc | 8 +- .../solver/Test_wilsonclover_gmres_unprec.cc | 8 +- tests/solver/Test_wilsonclover_mg.cc | 6 +- tests/solver/Test_wilsonclover_mr_unprec.cc | 8 +- tests/solver/Test_zmobius_cg_prec.cc | 5 +- 97 files changed, 564 insertions(+), 544 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba4abecd..7008008c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -324,9 +324,9 @@ public: GridBase* _cbgrid; int hermitian; - CartesianStencil Stencil; - CartesianStencil StencilEven; - CartesianStencil StencilOdd; + CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; std::vector Aeven; @@ -631,7 +631,7 @@ public: assert(Aself != nullptr); } - void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, const CoarseVector &in, CoarseVector &out, int dag) { int point = geom.npoint-1; autoView( out_v, out, AcceleratorWrite); @@ -694,7 +694,7 @@ public: 
} } - void DhopInternal(CartesianStencil &st, std::vector &a, + void DhopInternal(CartesianStencil &st, std::vector &a, const CoarseVector &in, CoarseVector &out, int dag) { SimpleCompressor compressor; @@ -784,9 +784,9 @@ public: _cbgrid(new GridRedBlackCartesian(&CoarseGrid)), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,_cbgrid), Aodd(geom.npoint,_cbgrid), @@ -804,9 +804,9 @@ public: _cbgrid(&CoarseRBGrid), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,&CoarseRBGrid), Aodd(geom.npoint,&CoarseRBGrid), diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index b1cf4d97..090fed46 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -526,6 +526,7 @@ public: (*this)(Linop,in[k],out[k]); } }; + virtual ~OperatorFunction(){}; }; template class LinearFunction { diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index b332a2c3..122dfb9c 100644 --- 
a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -34,15 +34,6 @@ directory NAMESPACE_BEGIN(Grid); -// These can move into a params header and be given MacroMagic serialisation -struct DefaultImplParams { - Coordinate dirichlet; // Blocksize of dirichlet BCs - int partialDirichlet; - DefaultImplParams() { - dirichlet.resize(0); - partialDirichlet=0; - }; -}; struct GparityWilsonImplParams { Coordinate twists; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index ff8a6433..da230e7e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -52,6 +52,16 @@ NAMESPACE_BEGIN(Grid); +// These can move into a params header and be given MacroMagic serialisation +struct DefaultImplParams { + Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; + DefaultImplParams() { + dirichlet.resize(0); + partialDirichlet=0; + }; +}; + /////////////////////////////////////////////////////////////////// // Gather for when there *is* need to SIMD split with compression /////////////////////////////////////////////////////////////////// diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 99633cee..58fdc6ce 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -143,7 +143,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD DoublePrecision2; }; -#ifdef GRID_CUDA +#if defined(GRID_CUDA) || defined(GRID_HIP) template<> struct GridTypeMapper > : public GridTypeMapper_Base { typedef std::complex scalar_type; typedef std::complex scalar_typeD; diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index ca1d6348..bd825ab3 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -1,4 +1,6 @@ +CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` ../../configure --enable-comms=mpi-auto \ +--with-lime=$CLIME \ --enable-unified=no \ --enable-shm=nvlink \ --enable-accelerator=hip \ @@ -7,7 +9,7 @@ --with-gmp=$OLCF_GMP_ROOT \ 
--with-fftw=$FFTW_DIR/.. \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ ---disable-gparity \ +--disable-fermion-reps \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \ LDFLAGS="-L/lib64 -L/opt/rocm-5.2.0/lib/ -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 42e15fb1..ac248b93 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -1,3 +1,5 @@ +. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh +spack load c-lime export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib module load emacs #module load gperftools diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 5418a8af..74492fd9 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -125,10 +125,10 @@ int main (int argc, char ** argv) std::cout<(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5DFA(Ddwf,DdwfF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestCGinversions(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5DFA(Ddwf,DdwfF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; @@ -137,54 +137,54 @@ int main (int argc, char ** argv) std::cout<(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5DFA(Dmob,DmobF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestCGinversions(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5DFA(Dmob,DmobF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5D(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ZMobiusFermionD 
ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c); + TestCGinversions(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5D(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5D(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + MobiusZolotarevFermionD Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0); + TestCGinversions(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5D(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5DFA(Dsham,DshamF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestCGinversions(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5DFA(Dsham,DshamF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5D(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ShamirZolotarevFermionD Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0); + TestCGinversions(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5D(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dov,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5DFA(Dov,DovF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestCGinversions(Dov,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestReconstruct5DFA(Dov,DovF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); - TestReconstruct5D(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonCayleyZolotarevFermionD Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0); + TestCGinversions(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + 
TestReconstruct5D(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); Grid_finalize(); } diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index b2f691d7..2190a9b0 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ b/tests/debug/Test_cayley_coarsen_support.cc @@ -95,8 +95,8 @@ int main (int argc, char ** argv) RealD mass=0.5; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); HermIndefOp.Op(src,ref); HermIndefOp.OpDiag(src,result); @@ -118,7 +118,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); typedef Aggregation Subspace; Subspace Aggregates(Coarse5d,FGrid,cb); diff --git a/tests/debug/Test_cayley_even_odd.cc b/tests/debug/Test_cayley_even_odd.cc index 5e800b26..b6eecc0f 100644 --- a/tests/debug/Test_cayley_even_odd.cc +++ b/tests/debug/Test_cayley_even_odd.cc @@ -76,41 +76,41 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5 =1.8; std::cout<(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + TestWhat(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; std::vector gamma(Ls,ComplexD(1.0,0.1)); std::cout<(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + MobiusFermionD Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + TestWhat(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(ZDmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ZMobiusFermionD ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c); + TestWhat(ZDmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + 
MobiusZolotarevFermionD Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0); + TestWhat(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ScaledShamirFermionD Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0); + TestWhat(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ShamirZolotarevFermionD Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0); + TestWhat(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonCayleyTanhFermionD Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + TestWhat(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonCayleyZolotarevFermionD Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0); + TestWhat(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); Grid_finalize(); } diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index 416017e5..997b8df5 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -83,8 +83,8 @@ int main (int argc, char ** argv) std::cout< HermIndefOp(Ddwf); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); const int nbasis = 8; @@ -95,7 +95,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,cb); Aggregates.CreateSubspace(RNG5,HermDefOp); diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index bfbc3cf7..26d3dc60 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -128,8 +128,8 @@ int main (int argc, char ** argv) std::cout<(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); 
+ DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + TestConserved(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; @@ -138,23 +138,23 @@ int main (int argc, char ** argv) std::cout<(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + MobiusFermionD Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + TestConserved(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + ScaledShamirFermionD Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0); + TestConserved(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev); + ZMobiusFermionD ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegas,b,c); + ZMobiusFermionD ZDmobrev(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegasrev,b,c); + TestConserved(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev); Grid_finalize(); } @@ -290,7 +290,7 @@ void TestConserved(Action & Ddwf, const RealD DmuPAmu{real(TensorRemove(sumPA[t]-sumPA[(t-1+Nt)%Nt]))}; std::cout< sumPAref; @@ -565,8 +565,8 @@ void TestConserved1(Action & Ddwf, Action & Ddwfrev, std::cout <<" PAc action "<oSites(),{ diff --git a/tests/debug/Test_heatbath_dwf_eofa.cc b/tests/debug/Test_heatbath_dwf_eofa.cc index e1c18021..5920054d 100644 --- a/tests/debug/Test_heatbath_dwf_eofa.cc +++ b/tests/debug/Test_heatbath_dwf_eofa.cc @@ -77,8 +77,8 @@ int main(int argc, char** argv) LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4, Umu); - DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5); - DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5); + DomainWallEOFAFermionD Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5); + DomainWallEOFAFermionD Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, 
mpv, mf, mpv, -1.0, 1, M5); // Construct the action and test the heatbath (zero initial guess) { diff --git a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc index 7eabfc65..982f35db 100644 --- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc @@ -41,7 +41,7 @@ using namespace Grid; ; typedef GparityWilsonImplR FermionImplPolicy; -typedef GparityDomainWallEOFAFermionR FermionAction; +typedef GparityDomainWallEOFAFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; // Parameters for test @@ -82,7 +82,7 @@ int main(int argc, char** argv) LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4, Umu); - // GparityDomainWallFermionR::ImplParams params; + // GparityDomainWallFermionD::ImplParams params; FermionAction::ImplParams params; FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, params); FermionAction Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, params); diff --git a/tests/debug/Test_heatbath_mobius_eofa.cc b/tests/debug/Test_heatbath_mobius_eofa.cc index 48806642..3824daab 100644 --- a/tests/debug/Test_heatbath_mobius_eofa.cc +++ b/tests/debug/Test_heatbath_mobius_eofa.cc @@ -79,8 +79,8 @@ int main(int argc, char** argv) LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4, Umu); - MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c); // Construct the action and test the heatbath (zero initial guess) { diff --git a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc index 52447e5e..fd3d96f8 
100644 --- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc @@ -41,7 +41,7 @@ using namespace Grid; ; typedef GparityWilsonImplR FermionImplPolicy; -typedef GparityMobiusEOFAFermionR FermionAction; +typedef GparityMobiusEOFAFermionD FermionAction; typedef typename FermionAction::FermionField FermionField; // Parameters for test diff --git a/tests/debug/Test_reweight_dwf_eofa.cc b/tests/debug/Test_reweight_dwf_eofa.cc index a150b18f..6a5452c7 100644 --- a/tests/debug/Test_reweight_dwf_eofa.cc +++ b/tests/debug/Test_reweight_dwf_eofa.cc @@ -105,10 +105,10 @@ int main(int argc, char **argv) SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators - DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); - DomainWallFermionR Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5); - SchurDiagMooeeOperator MdagM(Ddwf_f); - SchurDiagMooeeOperator VdagV(Ddwf_b); + DomainWallFermionD Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); + DomainWallFermionD Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5); + SchurDiagMooeeOperator MdagM(Ddwf_f); + SchurDiagMooeeOperator VdagV(Ddwf_b); // Degree 12 rational approximations to x^(1/4) and x^(-1/4) double lo = 0.0001; @@ -153,10 +153,10 @@ int main(int argc, char **argv) RealD shift_L = 0.0; RealD shift_R = -1.0; int pm = 1; - DomainWallEOFAFermionR Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); - DomainWallEOFAFermionR Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); - MdagMLinearOperator LdagL(Deofa_L); - MdagMLinearOperator RdagR(Deofa_R); + DomainWallEOFAFermionD Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5); + DomainWallEOFAFermionD Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5); + MdagMLinearOperator LdagL(Deofa_L); + MdagMLinearOperator RdagR(Deofa_R); // Stochastically estimate reweighting factor 
via EOFA RealD k = Deofa_L.k; diff --git a/tests/debug/Test_reweight_dwf_eofa_gparity.cc b/tests/debug/Test_reweight_dwf_eofa_gparity.cc index df2d95a0..70ae94aa 100644 --- a/tests/debug/Test_reweight_dwf_eofa_gparity.cc +++ b/tests/debug/Test_reweight_dwf_eofa_gparity.cc @@ -33,7 +33,7 @@ using namespace std; using namespace Grid; ; -typedef typename GparityDomainWallFermionR::FermionField FermionField; +typedef typename GparityDomainWallFermionD::FermionField FermionField; // parameters for test const std::vector grid_dim = { 8, 8, 8, 8 }; @@ -107,11 +107,11 @@ int main(int argc, char **argv) SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators - GparityDomainWallFermionR::ImplParams params; - GparityDomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, params); - GparityDomainWallFermionR Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, params); - SchurDiagMooeeOperator MdagM(Ddwf_f); - SchurDiagMooeeOperator VdagV(Ddwf_b); + GparityDomainWallFermionD::ImplParams params; + GparityDomainWallFermionD Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, params); + GparityDomainWallFermionD Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, params); + SchurDiagMooeeOperator MdagM(Ddwf_f); + SchurDiagMooeeOperator VdagV(Ddwf_b); // Degree 12 rational approximations to x^(1/4) and x^(-1/4) double lo = 0.0001; @@ -156,10 +156,10 @@ int main(int argc, char **argv) RealD shift_L = 0.0; RealD shift_R = -1.0; int pm = 1; - GparityDomainWallEOFAFermionR Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, params); - GparityDomainWallEOFAFermionR Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5, params); - MdagMLinearOperator LdagL(Deofa_L); - MdagMLinearOperator RdagR(Deofa_R); + GparityDomainWallEOFAFermionD Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, params); + GparityDomainWallEOFAFermionD Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, 
*UrbGrid, mb, mf, mb, shift_R, pm, M5, params); + MdagMLinearOperator LdagL(Deofa_L); + MdagMLinearOperator RdagR(Deofa_R); // Stochastically estimate reweighting factor via EOFA RealD k = Deofa_L.k; diff --git a/tests/debug/Test_reweight_mobius_eofa.cc b/tests/debug/Test_reweight_mobius_eofa.cc index 88ecab7d..744dd302 100644 --- a/tests/debug/Test_reweight_mobius_eofa.cc +++ b/tests/debug/Test_reweight_mobius_eofa.cc @@ -107,10 +107,10 @@ int main(int argc, char **argv) SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators - MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); - MobiusFermionR Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, b, c); - SchurDiagMooeeOperator MdagM(Ddwf_f); - SchurDiagMooeeOperator VdagV(Ddwf_b); + MobiusFermionD Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); + MobiusFermionD Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, b, c); + SchurDiagMooeeOperator MdagM(Ddwf_f); + SchurDiagMooeeOperator VdagV(Ddwf_b); // Degree 12 rational approximations to x^(1/4) and x^(-1/4) double lo = 0.0001; @@ -155,10 +155,10 @@ int main(int argc, char **argv) RealD shift_L = 0.0; RealD shift_R = -1.0; int pm = 1; - MobiusEOFAFermionR Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, b, c); - MobiusEOFAFermionR Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5, b, c); - MdagMLinearOperator LdagL(Deofa_L); - MdagMLinearOperator RdagR(Deofa_R); + MobiusEOFAFermionD Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, b, c); + MobiusEOFAFermionD Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5, b, c); + MdagMLinearOperator LdagL(Deofa_L); + MdagMLinearOperator RdagR(Deofa_R); // Stochastically estimate reweighting factor via EOFA RealD k = Deofa_L.k; diff --git a/tests/debug/Test_reweight_mobius_eofa_gparity.cc b/tests/debug/Test_reweight_mobius_eofa_gparity.cc index 
31708265..e2a4fb47 100644 --- a/tests/debug/Test_reweight_mobius_eofa_gparity.cc +++ b/tests/debug/Test_reweight_mobius_eofa_gparity.cc @@ -33,7 +33,7 @@ using namespace std; using namespace Grid; ; -typedef typename GparityDomainWallFermionR::FermionField FermionField; +typedef typename GparityDomainWallFermionD::FermionField FermionField; // parameters for test const std::vector grid_dim = { 8, 8, 8, 8 }; @@ -109,11 +109,11 @@ int main(int argc, char **argv) SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators - GparityDomainWallFermionR::ImplParams params; - GparityMobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c, params); - GparityMobiusFermionR Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, b, c, params); - SchurDiagMooeeOperator MdagM(Ddwf_f); - SchurDiagMooeeOperator VdagV(Ddwf_b); + GparityDomainWallFermionD::ImplParams params; + GparityMobiusFermionD Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c, params); + GparityMobiusFermionD Ddwf_b(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, M5, b, c, params); + SchurDiagMooeeOperator MdagM(Ddwf_f); + SchurDiagMooeeOperator VdagV(Ddwf_b); // Degree 12 rational approximations to x^(1/4) and x^(-1/4) double lo = 0.0001; @@ -158,10 +158,10 @@ int main(int argc, char **argv) RealD shift_L = 0.0; RealD shift_R = -1.0; int pm = 1; - GparityMobiusEOFAFermionR Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, b, c, params); - GparityMobiusEOFAFermionR Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5, b, c, params); - MdagMLinearOperator LdagL(Deofa_L); - MdagMLinearOperator RdagR(Deofa_R); + GparityMobiusEOFAFermionD Deofa_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5, b, c, params); + GparityMobiusEOFAFermionD Deofa_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5, b, c, params); + MdagMLinearOperator LdagL(Deofa_L); + MdagMLinearOperator RdagR(Deofa_R); // 
Stochastically estimate reweighting factor via EOFA RealD k = Deofa_L.k; diff --git a/tests/hmc/Test_hmc_WC2ASFG_Production.cc b/tests/hmc/Test_hmc_WC2ASFG_Production.cc index 55c84c0a..90f43ede 100644 --- a/tests/hmc/Test_hmc_WC2ASFG_Production.cc +++ b/tests/hmc/Test_hmc_WC2ASFG_Production.cc @@ -28,7 +28,7 @@ directory /* END LEGAL */ #include - +#ifdef ENABLE_FERMION_REPS namespace Grid{ struct FermionParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters, @@ -210,4 +210,6 @@ int main(int argc, char **argv) Grid_finalize(); } // main - +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WC2SFG_Production.cc b/tests/hmc/Test_hmc_WC2SFG_Production.cc index 80f7baaf..16ca05a0 100644 --- a/tests/hmc/Test_hmc_WC2SFG_Production.cc +++ b/tests/hmc/Test_hmc_WC2SFG_Production.cc @@ -29,6 +29,7 @@ directory #include +#ifdef ENABLE_FERMION_REPS namespace Grid{ struct FermionParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters, @@ -211,3 +212,6 @@ int main(int argc, char **argv) Grid_finalize(); } // main +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc index ac017860..3be63a15 100644 --- a/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc +++ b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc @@ -32,6 +32,7 @@ directory #include "Grid/Grid.h" +#ifdef ENABLE_FERMION_REPS namespace Grid{ struct FermionParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters, @@ -222,3 +223,6 @@ int main(int argc, char **argv) { Grid_finalize(); } // main +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WCadjFG_Production.cc b/tests/hmc/Test_hmc_WCadjFG_Production.cc index a9673e6c..f0e2742d 100644 --- a/tests/hmc/Test_hmc_WCadjFG_Production.cc +++ b/tests/hmc/Test_hmc_WCadjFG_Production.cc @@ -29,6 +29,7 @@ directory #include +#ifdef ENABLE_FERMION_REPS namespace 
Grid{ struct FermionParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters, @@ -211,3 +212,6 @@ int main(int argc, char **argv) } // main +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc b/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc index 774c9037..3d601d25 100644 --- a/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonAdjointFermionGauge.cc @@ -31,9 +31,10 @@ directory /* END LEGAL */ #include "Grid/Grid.h" +#ifdef ENABLE_FERMION_REPS + int main(int argc, char **argv) { using namespace Grid; - ; // Here change the allowed (higher) representations typedef Representations< FundamentalRepresentation, AdjointRepresentation > TheRepresentations; @@ -127,3 +128,6 @@ int main(int argc, char **argv) { } // main +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc index 60d760a6..66a325f2 100644 --- a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc @@ -33,6 +33,7 @@ directory +#ifdef ENABLE_FERMION_REPS int main(int argc, char **argv) { #ifndef GRID_CUDA @@ -138,3 +139,6 @@ int main(int argc, char **argv) { } // main +#else +int main(int argc, char **argv){} +#endif diff --git a/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc b/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc index bdcd27c5..4769e396 100644 --- a/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonTwoIndexSymmetricFermionGauge.cc @@ -29,6 +29,7 @@ directory /* END LEGAL */ #include "Grid/Grid.h" +#ifdef ENABLE_FERMION_REPS int main(int argc, char **argv) { using namespace Grid; ; @@ -127,3 +128,6 @@ int main(int argc, char **argv) { } // main +#else +int main(int argc, char **argv){} +#endif diff --git 
a/tests/lanczos/Test_WCMultiRep_lanczos.cc b/tests/lanczos/Test_WCMultiRep_lanczos.cc index 58759c96..0bfc75be 100644 --- a/tests/lanczos/Test_WCMultiRep_lanczos.cc +++ b/tests/lanczos/Test_WCMultiRep_lanczos.cc @@ -28,19 +28,21 @@ directory /* END LEGAL */ #include +#ifdef ENABLE_FERMION_REPS + using namespace std; using namespace Grid; -//typedef WilsonCloverFermionR FermionOp; -//typedef typename WilsonFermionR::FermionField FermionField; +//typedef WilsonCloverFermionD FermionOp; +//typedef typename WilsonFermionD::FermionField FermionField; typedef WilsonImplR FundImplPolicy; -typedef WilsonCloverFermionR FundFermionAction; +typedef WilsonCloverFermionD FundFermionAction; typedef typename FundFermionAction::FermionField FundFermionField; typedef WilsonTwoIndexAntiSymmetricImplR ASymmImplPolicy; -typedef WilsonCloverTwoIndexAntiSymmetricFermionR ASymmFermionAction; +typedef WilsonCloverTwoIndexAntiSymmetricFermionD ASymmFermionAction; typedef typename ASymmFermionAction::FermionField ASymmFermionField; @@ -175,3 +177,6 @@ NerscHmcCheckpointer Checkpoint(CPparams); Grid_finalize(); } +#else +int main(int argc,char **argv){ return 0;}; +#endif diff --git a/tests/lanczos/Test_compressed_lanczos.cc b/tests/lanczos/Test_compressed_lanczos.cc index d7d0d52d..28df3f99 100644 --- a/tests/lanczos/Test_compressed_lanczos.cc +++ b/tests/lanczos/Test_compressed_lanczos.cc @@ -188,8 +188,8 @@ int main (int argc, char ** argv) { std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; // ZMobius EO Operator - ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); - SchurDiagTwoOperator HermOp(Ddwf); + ZMobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); + SchurDiagTwoOperator HermOp(Ddwf); // Eigenvector storage LanczosParams fine =Params.FineParams; diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc 
b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index 669a7b6d..7a84a465 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -188,8 +188,8 @@ int main (int argc, char ** argv) { std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; // ZMobius EO Operator - ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); - SchurDiagTwoOperator HermOp(Ddwf); + ZMobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); + SchurDiagTwoOperator HermOp(Ddwf); // Eigenvector storage LanczosParams fine =Params.FineParams; diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc index f3cb567c..e82a9741 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc @@ -301,8 +301,8 @@ int main (int argc, char ** argv) { std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; // ZMobius EO Operator - ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); - SchurDiagTwoOperator HermOp(Ddwf); + ZMobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.); + SchurDiagTwoOperator HermOp(Ddwf); // Eigenvector storage LanczosParams fine =Params.FineParams; diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 1fe29bb2..1723e756 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -35,8 +35,8 @@ template struct Setup{}; template<> -struct Setup{ - static GparityMobiusFermionR* getAction(LatticeGaugeField &Umu, +struct Setup{ + static GparityMobiusFermionD* getAction(LatticeGaugeField &Umu, GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, 
GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ RealD mass=0.01; RealD M5=1.8; @@ -44,17 +44,17 @@ struct Setup{ GparityMobiusFermionD ::ImplParams params; std::vector twists({1,1,1,0}); params.twists = twists; - return new GparityMobiusFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); + return new GparityMobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); } }; template<> -struct Setup{ - static DomainWallFermionR* getAction(LatticeGaugeField &Umu, +struct Setup{ + static DomainWallFermionD* getAction(LatticeGaugeField &Umu, GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ RealD mass=0.01; RealD M5=1.8; - return new DomainWallFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); } }; @@ -127,9 +127,9 @@ int main (int argc, char ** argv) } if(action == "GparityMobius"){ - run(); + run(); }else if(action == "DWF"){ - run(); + run(); }else{ std::cout << "Unknown action" << std::endl; exit(1); diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc index af21fb1d..4814e8c6 100644 --- a/tests/lanczos/Test_wilson_lanczos.cc +++ b/tests/lanczos/Test_wilson_lanczos.cc @@ -32,8 +32,8 @@ using namespace std; using namespace Grid; ; -typedef WilsonFermionR FermionOp; -typedef typename WilsonFermionR::FermionField FermionField; +typedef WilsonFermionD FermionOp; +typedef typename WilsonFermionD::FermionField FermionField; RealD AllZero(RealD x) { return 0.; } diff --git a/tests/smearing/Test_WilsonFlow.cc b/tests/smearing/Test_WilsonFlow.cc index f339959a..e0726f87 100644 --- a/tests/smearing/Test_WilsonFlow.cc +++ b/tests/smearing/Test_WilsonFlow.cc @@ -96,13 +96,16 @@ int main(int argc, char **argv) { std::cout << GridLogMessage << "Initial plaquette: " << WilsonLoops::avgPlaquette(Umu) << std::endl; - WilsonFlow WF(WFPar.steps, 
WFPar.step_size, WFPar.meas_interval); + int t=WFPar.maxTau; + WilsonFlowAdaptive WF(WFPar.step_size, WFPar.maxTau, + 1.0e-4, + WFPar.meas_interval); - WF.smear_adaptive(Uflow, Umu, WFPar.maxTau); + WF.smear(Uflow, Umu); RealD WFlow_plaq = WilsonLoops::avgPlaquette(Uflow); RealD WFlow_TC = WilsonLoops::TopologicalCharge(Uflow); - RealD WFlow_T0 = WF.energyDensityPlaquette(Uflow); + RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl; std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl; std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl; diff --git a/tests/solver/Test_cf_cr_unprec.cc b/tests/solver/Test_cf_cr_unprec.cc index aa750175..3addcb62 100644 --- a/tests/solver/Test_cf_cr_unprec.cc +++ b/tests/solver/Test_cf_cr_unprec.cc @@ -71,14 +71,14 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.8; - OverlapWilsonContFracTanhFermionR Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + OverlapWilsonContFracTanhFermionD Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); ConjugateResidual MCR(1.0e-8,10000); - MdagMLinearOperator HermPosDefOp(Dcf); + MdagMLinearOperator HermPosDefOp(Dcf); MCR(HermPosDefOp,src,result); - HermitianLinearOperator HermIndefOp(Dcf); + HermitianLinearOperator HermIndefOp(Dcf); MCR(HermIndefOp,src,result); Grid_finalize(); diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc index c7127121..9d2f8c22 100644 --- a/tests/solver/Test_coarse_even_odd.cc +++ b/tests/solver/Test_coarse_even_odd.cc @@ -108,8 +108,8 @@ int main(int argc, char** argv) { RealD mass = -0.30; RealD csw = 1.9192; - WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); - MdagMLinearOperator MdagMOp_Dwc(Dwc); + WilsonCloverFermionD Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); 
///////////////////////////////////////////////////////////////////////////// // Type definitions // diff --git a/tests/solver/Test_contfrac_cg.cc b/tests/solver/Test_contfrac_cg.cc index afabae4c..52599d07 100644 --- a/tests/solver/Test_contfrac_cg.cc +++ b/tests/solver/Test_contfrac_cg.cc @@ -102,21 +102,21 @@ int main (int argc, char ** argv) std::cout<(Dcf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonContFracTanhFermionD Dcf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + TestCGinversions(Dcf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dcfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonContFracZolotarevFermionD Dcfz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,6.0); + TestCGinversions(Dcfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dpf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonPartialFractionTanhFermionD Dpf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0); + TestCGinversions(Dpf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dpfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + OverlapWilsonPartialFractionZolotarevFermionD Dpfz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,6.0); + TestCGinversions(Dpfz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); Grid_finalize(); diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index debb736a..f4e346bf 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -79,7 +79,7 @@ int main(int argc, char** argv) { RealD mass = 0.01; RealD M5 = 1.8; - DomainWallFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); + DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5); LatticeFermion src_o(FrbGrid); LatticeFermion result_o(FrbGrid); @@ -88,7 +88,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); ConjugateGradient CG(1.0e-5, 
10000, 0);// switch off the assert CGTimer.Start(); @@ -98,8 +98,5 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "Total CG time : " << CGTimer.Elapsed() << std::endl; - std::cout << GridLogMessage << "######## Dhop calls summary" << std::endl; - Ddwf.Report(); - Grid_finalize(); } diff --git a/tests/solver/Test_dwf_cg_schur.cc b/tests/solver/Test_dwf_cg_schur.cc index 6541e73d..bcc0cc40 100644 --- a/tests/solver/Test_dwf_cg_schur.cc +++ b/tests/solver/Test_dwf_cg_schur.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackDiagMooeeSolve SchurSolver(CG); diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index c867ccf3..58614c49 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -70,9 +70,9 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); diff --git a/tests/solver/Test_dwf_cr_unprec.cc b/tests/solver/Test_dwf_cr_unprec.cc index 8c8583ba..4d67231d 100644 --- a/tests/solver/Test_dwf_cr_unprec.cc +++ b/tests/solver/Test_dwf_cr_unprec.cc @@ -77,12 +77,12 @@ int main (int argc, char ** argv) RealD mass=0.5; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); MCR(HermOp,src,result); - Gamma5R5HermitianLinearOperator g5HermOp(Ddwf); + Gamma5R5HermitianLinearOperator g5HermOp(Ddwf); 
MCR(g5HermOp,src,result); diff --git a/tests/solver/Test_dwf_fpgcr.cc b/tests/solver/Test_dwf_fpgcr.cc index 42cc8de1..3d779474 100644 --- a/tests/solver/Test_dwf_fpgcr.cc +++ b/tests/solver/Test_dwf_fpgcr.cc @@ -77,12 +77,12 @@ int main (int argc, char ** argv) RealD mass=0.5; RealD M5=1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); std::cout< HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); TrivialPrecon simple; PrecGeneralisedConjugateResidual PGCR(1.0e-6,10000,HermOp,simple,4,160); @@ -92,7 +92,7 @@ int main (int argc, char ** argv) std::cout< g5HermOp(Ddwf); + Gamma5R5HermitianLinearOperator g5HermOp(Ddwf); PrecGeneralisedConjugateResidual PGCR5(1.0e-6,10000,g5HermOp,simple,4,160); result=Zero(); PGCR5(src,result); diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index ba77dffa..31b58284 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -254,7 +254,7 @@ int main (int argc, char ** argv) std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -263,7 +263,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -290,7 +290,7 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); @@ -351,16 +351,16 @@ int main (int argc, char ** argv) std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; - typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + typedef MultiGridPreconditioner, LinearFunction > 
ThreeLevelMG; // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space ChebyshevSmoother CoarseSmoother(0.1,12.0,3,L1LinOp,LDOp); - ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); - // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); CoarseMG Level2Precon (CoarseAggregates, L2Op, L1LinOp,LDOp, diff --git a/tests/solver/Test_dwf_hdcr_16_rb.cc b/tests/solver/Test_dwf_hdcr_16_rb.cc index 4682272d..ae8e7ae5 100644 --- a/tests/solver/Test_dwf_hdcr_16_rb.cc +++ b/tests/solver/Test_dwf_hdcr_16_rb.cc @@ -268,7 +268,7 @@ int main (int argc, char ** argv) std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -277,7 +277,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -311,7 +311,7 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); @@ -338,11 +338,11 @@ int main (int argc, char ** argv) std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner , SolverWrapper > TwoLevelMG; typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; - typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; - ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); std::cout< HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); // pCG(HermOpEO,src_o,result_o); std::cout< Subspace; typedef CoarsenedMatrix 
CoarseOperator; @@ -259,7 +259,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -292,7 +292,7 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); @@ -391,18 +391,18 @@ int main (int argc, char ** argv) std::cout< , NormalEquations > TwoLevelMG; - typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + // typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; - typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; - ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); /* // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); - // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); CoarseMG Level2Precon (CoarseAggregates, L2Op, L1LinOp,LDOp, @@ -463,7 +463,7 @@ int main (int argc, char ** argv) LatticeFermion result_o(FrbGrid); pickCheckerboard(Odd,src_o,src); result_o=Zero(); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); // pCG(HermOpEO,src_o,result_o); std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -297,7 +297,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace 
Aggregates(Coarse5d,FGrid,0); @@ -332,7 +332,7 @@ int main (int argc, char ** argv) std::cout< Level1Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); @@ -375,21 +375,21 @@ int main (int argc, char ** argv) std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space - // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s - // ChebyshevSmoother FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s - // ChebyshevSmoother FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65 s - // ChebyshevSmoother FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70 - // ChebyshevSmoother FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77 + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s + // ChebyshevSmoother FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s + // ChebyshevSmoother FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65 s + // ChebyshevSmoother FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70 + // ChebyshevSmoother FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77 - // ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s - // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s - ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s + // ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s - // MirsSmoother FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf); - // RedBlackSmoother FineRBSmoother(0.00,0.001,100,Ddwf); + // MirsSmoother 
FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf); + // RedBlackSmoother FineRBSmoother(0.00,0.001,100,Ddwf); // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space // ZeroGuesser CoarseZeroGuesser; @@ -416,7 +416,7 @@ int main (int argc, char ** argv) ConjugateGradient FineCG(1.0e-8,10000); - SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe LatticeFermion f_src_e(FrbGrid); f_src_e=1.0; LatticeFermion f_res_e(FrbGrid); f_res_e=Zero(); FineCG(FineDiagMooee,f_src_e,f_res_e); diff --git a/tests/solver/Test_dwf_hdcr_48_rb.cc b/tests/solver/Test_dwf_hdcr_48_rb.cc index 2b76681e..25ac1dac 100644 --- a/tests/solver/Test_dwf_hdcr_48_rb.cc +++ b/tests/solver/Test_dwf_hdcr_48_rb.cc @@ -264,7 +264,7 @@ int main (int argc, char ** argv) std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -273,7 +273,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -306,7 +306,7 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); @@ -332,9 +332,9 @@ int main (int argc, char ** argv) std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner , SolverWrapper > TwoLevelMG; typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; - typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; std::cout< FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); ZeroGuesser CoarseZeroGuesser; ConjugateGradient CoarseCG(tols[t],10000); 
SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); @@ -376,7 +376,7 @@ int main (int argc, char ** argv) LatticeFermion result_o(FrbGrid); pickCheckerboard(Odd,src_o,src); result_o=Zero(); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); pCG(HermOpEO,src_o,result_o); std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -257,7 +257,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -290,7 +290,7 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); @@ -386,18 +386,18 @@ int main (int argc, char ** argv) std::cout< , NormalEquations > TwoLevelMG; - typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + // typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; - typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; - ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); /* // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); - // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); CoarseMG Level2Precon (CoarseAggregates, L2Op, L1LinOp,LDOp, @@ -458,7 +458,7 @@ int main (int argc, char ** argv) LatticeFermion result_o(FrbGrid); 
pickCheckerboard(Odd,src_o,src); result_o=Zero(); - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); pCG(HermOpEO,src_o,result_o); std::cout< HermOp(Ddwf); - MdagMLinearOperator HermOpCk(Dchk); + MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); ConjugateGradient CG((1.0e-5/(me+1)),10000); s_res = Zero(); CG(HermOp,s_src,s_res); diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index d0a32460..1a679f45 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -34,9 +34,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename DomainWallFermionD::ImplParams params; double stp=1.0e-5; const int Ls=4; @@ -189,15 +189,15 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Building the solvers"< HermOp(Ddwf); - MdagMLinearOperator HermOpCk(Dchk); + MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); ConjugateGradient CG((stp),10000); s_res = Zero(); CG(HermOp,s_src,s_res); diff --git a/tests/solver/Test_dwf_mrhs_cg_mpieo.cc b/tests/solver/Test_dwf_mrhs_cg_mpieo.cc index 1906619f..0dcab577 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpieo.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpieo.cc @@ -34,9 +34,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename 
DomainWallFermionD::ImplParams params; const int Ls=4; @@ -124,15 +124,15 @@ int main (int argc, char ** argv) /////////////////////////////////////////////////////////////// RealD mass=0.01; RealD M5=1.8; - DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); - DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); + DomainWallFermionD Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); + DomainWallFermionD Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); std::cout << GridLogMessage << "****************************************************************** "< HermOp(Ddwf); - MdagMLinearOperator HermOpCk(Dchk); + MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); ConjugateGradient CG((1.0e-8/(me+1)),10000); s_res = Zero(); CG(HermOp,s_src,s_res); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc index e670b358..1cd83375 100644 --- a/tests/solver/Test_dwf_multigrid.cc +++ b/tests/solver/Test_dwf_multigrid.cc @@ -397,8 +397,8 @@ int main (int argc, char ** argv) std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -407,7 +407,7 @@ int main (int argc, char ** argv) std::cout< HermDefOp(Ddwf); + MdagMLinearOperator HermDefOp(Ddwf); Subspace Aggregates(Coarse5d,FGrid,0); @@ -435,8 +435,8 @@ int main (int argc, char ** argv) typedef CoarsenedMatrix Level1Op; typedef CoarsenedMatrix,nbasisc> Level2Op; - Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); std::cout< FineCG(tol,MaxIt); // GeneralisedMinimalResidual FineGMRES(tol,MaxIt,20); - MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M - PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M - SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe - SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e + MdagMLinearOperator 
FineMdagM(Ddwf); // M^\dag M + PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e MdagMLinearOperator CoarseMdagM(LDOp); PVdagMLinearOperator CoarsePVdagM(LDOp,LDOpPV); @@ -552,7 +552,7 @@ int main (int argc, char ** argv) std::cout< CoarseMgridCG(0.001,1000); - ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); typedef HDCRPreconditioner > TwoLevelHDCR; TwoLevelHDCR TwoLevelPrecon(Aggregates, diff --git a/tests/solver/Test_dwf_qmr_unprec.cc b/tests/solver/Test_dwf_qmr_unprec.cc index 370e7409..eeb20c95 100644 --- a/tests/solver/Test_dwf_qmr_unprec.cc +++ b/tests/solver/Test_dwf_qmr_unprec.cc @@ -66,17 +66,17 @@ int main (int argc, char ** argv) RealD mass=0.0; RealD M5=-1.8; - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - Gamma5R5HermitianLinearOperator g5HermOp(Ddwf); + Gamma5R5HermitianLinearOperator g5HermOp(Ddwf); QMR(g5HermOp,src,result); GMR(g5HermOp,src,result); - NonHermitianLinearOperator NonHermOp(Ddwf); + NonHermitianLinearOperator NonHermOp(Ddwf); QMR(NonHermOp,src,result); GMR(NonHermOp,src,result); - MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); diff --git a/tests/solver/Test_eofa_inv.cc b/tests/solver/Test_eofa_inv.cc index 564405c2..71952b97 100644 --- a/tests/solver/Test_eofa_inv.cc +++ b/tests/solver/Test_eofa_inv.cc @@ -83,8 +83,8 @@ int main (int argc, char** argv) RealD mf = 0.01; RealD mb = 1.0; RealD M5 = 1.8; - MobiusEOFAFermionR Lop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5, b, c); - MobiusEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c); + MobiusEOFAFermionD Lop(U, *FGrid, 
*FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c); OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-10, 12); ConjugateGradient CG(1.0e-10, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); diff --git a/tests/solver/Test_hw_multigrid.cc b/tests/solver/Test_hw_multigrid.cc index 66c88883..fd30bca7 100644 --- a/tests/solver/Test_hw_multigrid.cc +++ b/tests/solver/Test_hw_multigrid.cc @@ -292,9 +292,9 @@ int main (int argc, char ** argv) std::cout< Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -304,7 +304,7 @@ int main (int argc, char ** argv) std::cout< SubspaceOp(Dw); + MdagMLinearOperator SubspaceOp(Dw); Subspace Aggregates4D(Coarse4d,UGrid,0); Subspace Aggregates5D(Coarse5d,FGrid,0); @@ -335,7 +335,7 @@ int main (int argc, char ** argv) std::cout< Level1Op; - NonHermitianLinearOperator LinOpDwf(Ddwf); + NonHermitianLinearOperator LinOpDwf(Ddwf); Level1Op LDOp (*Coarse5d,*Coarse5dRB,0); diff --git a/tests/solver/Test_hw_multigrid_mixed_48.cc b/tests/solver/Test_hw_multigrid_mixed_48.cc index 0e8d6a17..3a31ddbe 100644 --- a/tests/solver/Test_hw_multigrid_mixed_48.cc +++ b/tests/solver/Test_hw_multigrid_mixed_48.cc @@ -395,7 +395,7 @@ public: Geometry geom; GridBase *Coarse5D; GridBase *Coarse4D; - CartesianStencil Stencil; + CartesianStencil Stencil; CoarsenedMatrix &Dw; GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know @@ -409,7 +409,7 @@ public: Coarse5D(&CoarseGrid5), Dw(_Dw), geom(CoarseGrid5._ndimension), - Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,DefaultImplParams()) { }; @@ -981,9 +981,9 @@ int main (int argc, char ** argv) RealD mass=0.00078; - WilsonFermionR Dw(Umu,*UGrid,*UrbGrid,-M5); - DomainWallFermionR 
Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - DomainWallFermionR Dpv (Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); + WilsonFermionD Dw(Umu,*UGrid,*UrbGrid,-M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Dpv (Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); typedef Aggregation Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -994,21 +994,21 @@ int main (int argc, char ** argv) std::cout< MdagM_Dw(Dw_null); + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.75); // 600 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.80); // 800 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.82); // 1023 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.85); // 1428 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.87); // 1900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.90); // 3900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.92); // 6200 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.94); // 8882 iters + WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.95); // 9170 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.96); // 8882 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.97); // 8406 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.99); // 6900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-1.01); // 6397 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-1.00); // 5900 iters + MdagMLinearOperator MdagM_Dw(Dw_null); std::cout< Level1Op4; typedef CoarseCayleyFermion Level1Op5; Level1Op4 c_Dw (*Coarse4d,0); - NonHermitianLinearOperator LinOpDw(Dw); + NonHermitianLinearOperator LinOpDw(Dw); c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); @@ -1127,8 +1127,8 @@ int main (int argc, char ** argv) ConjugateGradient CoarseCG(tol,MaxIt); ConjugateGradient FineCG(tol,MaxIt); - NonHermitianLinearOperator FineM(Ddwf); - MdagMLinearOperator FineMdagM(Ddwf); // 
M^\dag M + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M NonHermitianLinearOperator CoarseM(c_Dwf); MdagMLinearOperator CoarseMdagM(c_Dwf); @@ -1233,39 +1233,39 @@ typedef HDCRPreconditioner,nbasisc,NormalEquations // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space - // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s - // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s - // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish - // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + // 
ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // - ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); - ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // + ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); + ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // - // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s - // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s - // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual - // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. - // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower - // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower - // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); ThreeLevelMG ThreeLevelPrecon(Aggregates4D, FineM, diff --git a/tests/solver/Test_hw_multigrid_mixed_48_rb.cc b/tests/solver/Test_hw_multigrid_mixed_48_rb.cc index e7ceb022..0f18893e 100644 --- a/tests/solver/Test_hw_multigrid_mixed_48_rb.cc +++ b/tests/solver/Test_hw_multigrid_mixed_48_rb.cc @@ -395,7 +395,7 @@ public: Geometry geom; GridBase *Coarse5D; GridBase *Coarse4D; - CartesianStencil Stencil; + CartesianStencil Stencil; CoarsenedMatrix &Dw; GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know @@ -409,7 +409,7 @@ public: Coarse5D(&CoarseGrid5), Dw(_Dw), geom(CoarseGrid5._ndimension), - Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,DefaultImplParams()) { }; @@ -1005,9 +1005,9 @@ int main (int argc, char ** argv) RealD mass=0.00078; - WilsonFermionR Dw(Umu,*UGrid,*UrbGrid,-M5); - DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - DomainWallFermionR Dpv (Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); + WilsonFermionD Dw(Umu,*UGrid,*UrbGrid,-M5); + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionD Dpv (Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); typedef Aggregation Subspace; typedef CoarsenedMatrix CoarseOperator; @@ -1018,21 +1018,21 @@ int main (int argc, char ** argv) std::cout< MdagM_Dw(Dw_null); + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.75); // 600 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.80); // 800 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.82); // 1023 iters + // 
WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.85); // 1428 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.87); // 1900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.90); // 3900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.92); // 6200 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.94); // 8882 iters + WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.95); // 9170 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.96); // 8882 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.97); // 8406 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-0.99); // 6900 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-1.01); // 6397 iters + // WilsonFermionD Dw_null(Umu,*UGrid,*UrbGrid,-1.00); // 5900 iters + MdagMLinearOperator MdagM_Dw(Dw_null); std::cout< Level1Op4; typedef CoarseCayleyFermion Level1Op5; Level1Op4 c_Dw (*Coarse4d,0); - NonHermitianLinearOperator LinOpDw(Dw); + NonHermitianLinearOperator LinOpDw(Dw); c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); @@ -1148,8 +1148,8 @@ int main (int argc, char ** argv) ConjugateGradient CoarseCG(tol,MaxIt); ConjugateGradient FineCG(tol,MaxIt); - NonHermitianLinearOperator FineM(Ddwf); - MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M NonHermitianLinearOperator CoarseM(c_Dwf); MdagMLinearOperator CoarseMdagM(c_Dwf); @@ -1272,38 +1272,38 @@ typedef HDCRPreconditioner,nbasisc,LinearFunction< // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space - // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); - // ChebyshevSmoother 
FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s - // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s - // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish - // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s - // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // - ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); + ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s - // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s - // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual - // 
ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. - // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower - // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); - // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower - // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); ThreeLevelMG ThreeLevelPrecon(Aggregates4D, FineM, diff --git a/tests/solver/Test_mobius_bcg.cc b/tests/solver/Test_mobius_bcg.cc index 8092d61c..a54b4a05 100644 --- a/tests/solver/Test_mobius_bcg.cc +++ b/tests/solver/Test_mobius_bcg.cc @@ -33,9 +33,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename MobiusFermionR::FermionField FermionField; - typedef typename MobiusFermionR::ComplexField ComplexField; - typename MobiusFermionR::ImplParams params; + typedef typename MobiusFermionD::FermionField FermionField; + typedef typename MobiusFermionD::ComplexField ComplexField; + typename MobiusFermionD::ImplParams params; const int Ls=12; @@ -158,15 +158,15 @@ int main (int argc, char ** argv) RealD mobius_factor=32./12.; RealD mobius_b=0.5*(mobius_factor+1.); RealD mobius_c=0.5*(mobius_factor-1.); - MobiusFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,mobius_b,mobius_c,params); - MobiusFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,mobius_b,mobius_c,params); + MobiusFermionD Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,mobius_b,mobius_c,params); + MobiusFermionD Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,mobius_b,mobius_c,params); std::cout << GridLogMessage << "****************************************************************** "< HermOp(Ddwf); - MdagMLinearOperator HermOpCk(Dchk); + MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); ConjugateGradient CG((stp),100000); s_res = Zero(); diff --git a/tests/solver/Test_mobius_bcg_nosplit.cc b/tests/solver/Test_mobius_bcg_nosplit.cc index de02b1e3..f33a40ea 100644 --- 
a/tests/solver/Test_mobius_bcg_nosplit.cc +++ b/tests/solver/Test_mobius_bcg_nosplit.cc @@ -35,9 +35,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename DomainWallFermionD::ImplParams params; const int Ls=16; @@ -106,13 +106,13 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Building the solvers"< HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); ConjugateGradient CG((stp),100000); for(int rhs=0;rhs<1;rhs++){ diff --git a/tests/solver/Test_mobius_bcg_phys_nosplit.cc b/tests/solver/Test_mobius_bcg_phys_nosplit.cc index 2fe573ce..76a6f7e1 100644 --- a/tests/solver/Test_mobius_bcg_phys_nosplit.cc +++ b/tests/solver/Test_mobius_bcg_phys_nosplit.cc @@ -35,9 +35,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename DomainWallFermionD::ImplParams params; const int Ls=16; @@ -107,7 +107,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Building the solvers"< HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); ConjugateGradient CG((stp),100000); for(int rhs=0;rhs<1;rhs++){ diff --git a/tests/solver/Test_mobius_bcg_prec_nosplit.cc b/tests/solver/Test_mobius_bcg_prec_nosplit.cc index 3ac0d42b..f793893c 100644 --- a/tests/solver/Test_mobius_bcg_prec_nosplit.cc +++ b/tests/solver/Test_mobius_bcg_prec_nosplit.cc @@ -35,9 +35,9 @@ using namespace Grid; int main (int argc, char ** 
argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename DomainWallFermionD::ImplParams params; const int Ls=16; @@ -106,13 +106,13 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Building the solvers"< HermOp(Ddwf); + MdagMLinearOperator HermOp(Ddwf); ConjugateGradient CG((stp),100000); for(int rhs=0;rhs<1;rhs++){ diff --git a/tests/solver/Test_split_grid.cc b/tests/solver/Test_split_grid.cc index 85626c8e..39441c82 100644 --- a/tests/solver/Test_split_grid.cc +++ b/tests/solver/Test_split_grid.cc @@ -34,9 +34,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename DomainWallFermionR::FermionField FermionField; - typedef typename DomainWallFermionR::ComplexField ComplexField; - typename DomainWallFermionR::ImplParams params; + typedef typename DomainWallFermionD::FermionField FermionField; + typedef typename DomainWallFermionD::ComplexField ComplexField; + typename DomainWallFermionD::ImplParams params; const int Ls=4; @@ -117,15 +117,15 @@ int main (int argc, char ** argv) /////////////////////////////////////////////////////////////// RealD mass=0.01; RealD M5=1.8; - DomainWallFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); - DomainWallFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); + DomainWallFermionD Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5); + DomainWallFermionD Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5); std::cout << GridLogMessage << "****************************************************************** "< HermOp(Ddwf); - MdagMLinearOperator HermOpCk(Dchk); + MdagMLinearOperator HermOp(Ddwf); + MdagMLinearOperator HermOpCk(Dchk); ConjugateGradient CG((1.0e-8/(me+1)),10000); s_res = Zero(); 
CG(HermOp,s_src,s_res); diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index c5306e85..4cb7801e 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -46,9 +46,9 @@ struct scal { int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; - typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; - typename ImprovedStaggeredFermion5DR::ImplParams params; + typedef typename ImprovedStaggeredFermion5DD::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DD::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DD::ImplParams params; const int Ls=8; @@ -98,8 +98,8 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); - SchurStaggeredOperator HermOp(Ds); + ImprovedStaggeredFermion5DD Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); + SchurStaggeredOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); int blockDim = 0; @@ -111,8 +111,8 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); + ImprovedStaggeredFermionD Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0); + SchurStaggeredOperator HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d); FermionField result4d_o(UrbGrid); @@ -135,7 +135,6 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "************************************************************************ "< using namespace std; using namespace Grid; - ; - -template -struct scal { - d internal; -}; Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, @@ -46,9 +40,9 @@ struct scal { int main (int argc, char ** argv) { - typedef 
typename ImprovedStaggeredFermion5DR::FermionField FermionField; - typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; - typename ImprovedStaggeredFermion5DR::ImplParams params; + typedef typename ImprovedStaggeredFermion5DD::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DD::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DD::ImplParams params; const int Ls=8; @@ -83,8 +77,8 @@ int main (int argc, char ** argv) volume=volume*latt_size[mu]; } - ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + ImprovedStaggeredFermion5DD Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); + MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); int blockDim = 0; @@ -95,8 +89,8 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); + ImprovedStaggeredFermionD Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0); + MdagMLinearOperator HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=Zero(); @@ -120,7 +114,6 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "************************************************************************ "< HermOp(Ds); + MdagMLinearOperator HermOp(Ds); CommunicationAvoidingGeneralisedMinimalResidual CAGMRES(1.0e-8, 10000, 25); CAGMRES(HermOp,src,result); diff --git a/tests/solver/Test_staggered_cg_prec.cc b/tests/solver/Test_staggered_cg_prec.cc index 854ef632..bc80da09 100644 --- a/tests/solver/Test_staggered_cg_prec.cc +++ b/tests/solver/Test_staggered_cg_prec.cc @@ -47,8 +47,8 @@ struct scal { int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typename 
ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -74,14 +74,14 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); FermionField res_o(&RBGrid); FermionField src_o(&RBGrid); pickCheckerboard(Odd,src_o,src); res_o=Zero(); - SchurStaggeredOperator HermOpEO(Ds); + SchurStaggeredOperator HermOpEO(Ds); ConjugateGradient CG(1.0e-8,10000); double t1=usecond(); CG(HermOpEO,src_o,res_o); diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc index d8e5bdd4..5d7d073e 100644 --- a/tests/solver/Test_staggered_cg_schur.cc +++ b/tests/solver/Test_staggered_cg_schur.cc @@ -45,8 +45,8 @@ struct scal { int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); Coordinate latt_size = GridDefaultLatt(); @@ -68,7 +68,7 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackStaggeredSolve SchurSolver(CG); diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc index e023b910..466f1d04 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -47,9 +47,9 @@ struct scal { int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams 
params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -76,9 +76,9 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + MdagMLinearOperator HermOp(Ds); ConjugateGradient CG(1.0e-6,10000); CG(HermOp,src,result); diff --git a/tests/solver/Test_staggered_fcagmres_prec.cc b/tests/solver/Test_staggered_fcagmres_prec.cc index 692d688e..7587748e 100644 --- a/tests/solver/Test_staggered_fcagmres_prec.cc +++ b/tests/solver/Test_staggered_fcagmres_prec.cc @@ -33,9 +33,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -62,9 +62,9 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + MdagMLinearOperator HermOp(Ds); TrivialPrecon simple; diff --git a/tests/solver/Test_staggered_fgmres_prec.cc b/tests/solver/Test_staggered_fgmres_prec.cc index fe6da67c..a3c65057 100644 --- a/tests/solver/Test_staggered_fgmres_prec.cc +++ b/tests/solver/Test_staggered_fgmres_prec.cc @@ -33,9 +33,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename 
ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -62,9 +62,9 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + MdagMLinearOperator HermOp(Ds); TrivialPrecon simple; diff --git a/tests/solver/Test_staggered_gmres_unprec.cc b/tests/solver/Test_staggered_gmres_unprec.cc index ec9d4608..abfeab75 100644 --- a/tests/solver/Test_staggered_gmres_unprec.cc +++ b/tests/solver/Test_staggered_gmres_unprec.cc @@ -33,9 +33,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -62,9 +62,9 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + MdagMLinearOperator HermOp(Ds); GeneralisedMinimalResidual GMRES(1.0e-8, 10000, 25); GMRES(HermOp,src,result); diff --git a/tests/solver/Test_staggered_mr_unprec.cc b/tests/solver/Test_staggered_mr_unprec.cc index ddbb8de3..1cdd60f9 100644 --- 
a/tests/solver/Test_staggered_mr_unprec.cc +++ b/tests/solver/Test_staggered_mr_unprec.cc @@ -33,9 +33,9 @@ using namespace Grid; int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typedef typename ImprovedStaggeredFermionR::ComplexField ComplexField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typedef typename ImprovedStaggeredFermionD::ComplexField ComplexField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -62,9 +62,9 @@ int main (int argc, char ** argv) RealD c1=9.0/8.0; RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - MdagMLinearOperator HermOp(Ds); + MdagMLinearOperator HermOp(Ds); MinimalResidual MR(1.0e-8,10000,0.8); MR(HermOp,src,result); diff --git a/tests/solver/Test_staggered_multishift.cc b/tests/solver/Test_staggered_multishift.cc index 856f0b87..9f6b37d6 100644 --- a/tests/solver/Test_staggered_multishift.cc +++ b/tests/solver/Test_staggered_multishift.cc @@ -46,8 +46,8 @@ struct scal { int main (int argc, char ** argv) { - typedef typename ImprovedStaggeredFermionR::FermionField FermionField; - typename ImprovedStaggeredFermionR::ImplParams params; + typedef typename ImprovedStaggeredFermionD::FermionField FermionField; + typename ImprovedStaggeredFermionD::ImplParams params; Grid_init(&argc,&argv); @@ -90,8 +90,8 @@ int main (int argc, char ** argv) RealD c2=-1.0/24.0; RealD u0=1.0; - ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); - SchurStaggeredOperator HermOpEO(Ds); + ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0); + SchurStaggeredOperator HermOpEO(Ds); FermionField src(&Grid); random(pRNG,src); FermionField src_o(&RBGrid); diff --git a/tests/solver/Test_wilson_cagmres_unprec.cc 
b/tests/solver/Test_wilson_cagmres_unprec.cc index 226d0719..80381a27 100644 --- a/tests/solver/Test_wilson_cagmres_unprec.cc +++ b/tests/solver/Test_wilson_cagmres_unprec.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); CommunicationAvoidingGeneralisedMinimalResidual CAGMRES(1.0e-8, 10000, 25); CAGMRES(HermOp,src,result); diff --git a/tests/solver/Test_wilson_cg_prec.cc b/tests/solver/Test_wilson_cg_prec.cc index a28e014e..cb480e8c 100644 --- a/tests/solver/Test_wilson_cg_prec.cc +++ b/tests/solver/Test_wilson_cg_prec.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); // HermitianOperator HermOp(Dw); // ConjugateGradient CG(1.0e-8,10000); @@ -80,7 +80,7 @@ int main (int argc, char ** argv) pickCheckerboard(Odd,src_o,src); result_o=Zero(); - SchurDiagMooeeOperator HermOpEO(Dw); + SchurDiagMooeeOperator HermOpEO(Dw); ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o,result_o); diff --git a/tests/solver/Test_wilson_cg_schur.cc b/tests/solver/Test_wilson_cg_schur.cc index 97482131..601eb6b2 100644 --- a/tests/solver/Test_wilson_cg_schur.cc +++ b/tests/solver/Test_wilson_cg_schur.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) LatticeFermion resid(&Grid); RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); ConjugateGradient CG(1.0e-8,10000); SchurRedBlackDiagMooeeSolve SchurSolver(CG); diff --git a/tests/solver/Test_wilson_cg_unprec.cc b/tests/solver/Test_wilson_cg_unprec.cc index 07f6ba7b..f1ecebd3 100644 --- a/tests/solver/Test_wilson_cg_unprec.cc +++ b/tests/solver/Test_wilson_cg_unprec.cc @@ -68,9 +68,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD 
Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src,result); diff --git a/tests/solver/Test_wilson_cr_unprec.cc b/tests/solver/Test_wilson_cr_unprec.cc index 67510a23..a8b49afd 100644 --- a/tests/solver/Test_wilson_cr_unprec.cc +++ b/tests/solver/Test_wilson_cr_unprec.cc @@ -70,9 +70,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); ConjugateResidual MCR(1.0e-8,10000); diff --git a/tests/solver/Test_wilson_fcagmres_prec.cc b/tests/solver/Test_wilson_fcagmres_prec.cc index d2a1acf4..66f9f518 100644 --- a/tests/solver/Test_wilson_fcagmres_prec.cc +++ b/tests/solver/Test_wilson_fcagmres_prec.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); TrivialPrecon simple; diff --git a/tests/solver/Test_wilson_fgmres_prec.cc b/tests/solver/Test_wilson_fgmres_prec.cc index 02d8f9f2..61368636 100644 --- a/tests/solver/Test_wilson_fgmres_prec.cc +++ b/tests/solver/Test_wilson_fgmres_prec.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); TrivialPrecon simple; diff --git a/tests/solver/Test_wilson_gmres_unprec.cc b/tests/solver/Test_wilson_gmres_unprec.cc index e52c047f..5f2728ce 100644 --- a/tests/solver/Test_wilson_gmres_unprec.cc +++ b/tests/solver/Test_wilson_gmres_unprec.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator 
HermOp(Dw); GeneralisedMinimalResidual GMRES(1.0e-8, 10000, 25); GMRES(HermOp,src,result); diff --git a/tests/solver/Test_wilson_mg.cc b/tests/solver/Test_wilson_mg.cc index 99a16e19..875bf32a 100644 --- a/tests/solver/Test_wilson_mg.cc +++ b/tests/solver/Test_wilson_mg.cc @@ -77,16 +77,16 @@ int main(int argc, char **argv) { // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used const int nbasis = 40; - WilsonFermionR Dw(Umu, *FGrid, *FrbGrid, mass); + WilsonFermionD Dw(Umu, *FGrid, *FrbGrid, mass); - MdagMLinearOperator MdagMOpDw(Dw); + MdagMLinearOperator MdagMOpDw(Dw); std::cout << GridLogMessage << "**************************************************" << std::endl; std::cout << GridLogMessage << "Testing Multigrid for Wilson" << std::endl; std::cout << GridLogMessage << "**************************************************" << std::endl; TrivialPrecon TrivialPrecon; - auto MGPreconDw = createMGInstance(mgParams, levelInfo, Dw, Dw); + auto MGPreconDw = createMGInstance(mgParams, levelInfo, Dw, Dw); MGPreconDw->setup(); diff --git a/tests/solver/Test_wilson_mr_unprec.cc b/tests/solver/Test_wilson_mr_unprec.cc index fef83794..c71392e4 100644 --- a/tests/solver/Test_wilson_mr_unprec.cc +++ b/tests/solver/Test_wilson_mr_unprec.cc @@ -55,9 +55,9 @@ int main (int argc, char ** argv) } RealD mass=0.5; - WilsonFermionR Dw(Umu,Grid,RBGrid,mass); + WilsonFermionD Dw(Umu,Grid,RBGrid,mass); - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); MinimalResidual MR(1.0e-8,10000,0.8); MR(HermOp,src,result); diff --git a/tests/solver/Test_wilson_qmr_unprec.cc b/tests/solver/Test_wilson_qmr_unprec.cc index c0b42a28..0cd132e4 100644 --- a/tests/solver/Test_wilson_qmr_unprec.cc +++ b/tests/solver/Test_wilson_qmr_unprec.cc @@ -56,9 +56,9 @@ int main (int argc, char ** argv) QuasiMinimalResidual QMR(1.0e-8,10000); RealD mass=0.0; - WilsonFermionR Dw(Umu,*Grid,*rbGrid,mass); + WilsonFermionD Dw(Umu,*Grid,*rbGrid,mass); - 
NonHermitianLinearOperator NonHermOp(Dw); + NonHermitianLinearOperator NonHermOp(Dw); QMR(NonHermOp,src,result); Grid_finalize(); diff --git a/tests/solver/Test_wilsonclover_bicgstab_prec.cc b/tests/solver/Test_wilsonclover_bicgstab_prec.cc index b382b1bb..d265e687 100644 --- a/tests/solver/Test_wilsonclover_bicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_prec.cc @@ -70,14 +70,14 @@ int main (int argc, char ** argv) RealD mass = -0.1; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); LatticeFermion src_o(&RBGrid); LatticeFermion result_o(&RBGrid); pickCheckerboard(Odd, src_o, src); result_o = Zero(); - NonHermitianSchurDiagMooeeOperator HermOp(Dw); + NonHermitianSchurDiagMooeeOperator HermOp(Dw); BiCGSTAB CG(1.0e-8,10000); CG(HermOp, src_o, result_o); diff --git a/tests/solver/Test_wilsonclover_bicgstab_schur.cc b/tests/solver/Test_wilsonclover_bicgstab_schur.cc index f09d7cd1..38bfdb72 100644 --- a/tests/solver/Test_wilsonclover_bicgstab_schur.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_schur.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) RealD mass = -0.1; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); BiCGSTAB CG(1.0e-8,10000); NonHermitianSchurRedBlackDiagMooeeSolve SchurSolver(CG); diff --git a/tests/solver/Test_wilsonclover_bicgstab_unprec.cc b/tests/solver/Test_wilsonclover_bicgstab_unprec.cc index f546a744..48f194b0 100644 --- a/tests/solver/Test_wilsonclover_bicgstab_unprec.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_unprec.cc @@ -70,9 +70,9 @@ int main (int argc, char ** argv) RealD mass = -0.1; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); - 
NonHermitianLinearOperator HermOp(Dw); + NonHermitianLinearOperator HermOp(Dw); BiCGSTAB CG(1.0e-8,10000); CG(HermOp,src,result); diff --git a/tests/solver/Test_wilsonclover_cagmres_unprec.cc b/tests/solver/Test_wilsonclover_cagmres_unprec.cc index a8818168..8b264139 100644 --- a/tests/solver/Test_wilsonclover_cagmres_unprec.cc +++ b/tests/solver/Test_wilsonclover_cagmres_unprec.cc @@ -44,8 +44,8 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); random(pRNG,src); @@ -61,9 +61,9 @@ int main (int argc, char ** argv) RealD mass = 0.5; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); + WilsonCloverFermionD Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); - MdagMLinearOperator HermOp(Dwc); + MdagMLinearOperator HermOp(Dwc); CommunicationAvoidingGeneralisedMinimalResidual CAGMRES(1.0e-8, 10000, 25); CAGMRES(HermOp,src,result); diff --git a/tests/solver/Test_wilsonclover_cg_prec.cc b/tests/solver/Test_wilsonclover_cg_prec.cc index abf64a1f..a0b3a712 100644 --- a/tests/solver/Test_wilsonclover_cg_prec.cc +++ b/tests/solver/Test_wilsonclover_cg_prec.cc @@ -72,10 +72,10 @@ int main (int argc, char ** argv) RealD csw_r = 1.0; RealD csw_t = 1.0; RealD cF = 1.0; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); - CompactWilsonCloverFermionR Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); - WilsonExpCloverFermionR Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); - CompactWilsonExpCloverFermionR Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + 
CompactWilsonCloverFermionD Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + WilsonExpCloverFermionD Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); + CompactWilsonExpCloverFermionD Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); // HermitianOperator HermOp(Dw); @@ -89,22 +89,22 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); std::cout << GridLogMessage << "Testing Wilson Clover" << std::endl; - SchurDiagMooeeOperator HermOpEO(Dw); + SchurDiagMooeeOperator HermOpEO(Dw); result_o=Zero(); CG(HermOpEO,src_o,result_o); std::cout << GridLogMessage << "Testing Compact Wilson Clover" << std::endl; - SchurDiagMooeeOperator HermOpEO_compact(Dw_compact); + SchurDiagMooeeOperator HermOpEO_compact(Dw_compact); result_o=Zero(); CG(HermOpEO_compact,src_o,result_o); std::cout << GridLogMessage << "Testing Wilson Exp Clover" << std::endl; - SchurDiagMooeeOperator HermOpEO_exp(Dwe); + SchurDiagMooeeOperator HermOpEO_exp(Dwe); result_o=Zero(); CG(HermOpEO_exp,src_o,result_o); std::cout << GridLogMessage << "Testing Compact Wilson Exp Clover" << std::endl; - SchurDiagMooeeOperator HermOpEO_exp_compact(Dwe_compact); + SchurDiagMooeeOperator HermOpEO_exp_compact(Dwe_compact); result_o=Zero(); CG(HermOpEO_exp_compact,src_o,result_o); diff --git a/tests/solver/Test_wilsonclover_cg_schur.cc b/tests/solver/Test_wilsonclover_cg_schur.cc index 50d06af7..50a1c4a6 100644 --- a/tests/solver/Test_wilsonclover_cg_schur.cc +++ b/tests/solver/Test_wilsonclover_cg_schur.cc @@ -72,22 +72,22 @@ int main (int argc, char ** argv) RealD cF = 1.0; std::cout << GridLogMessage << "Testing Wilson Clover" << std::endl; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); result=Zero(); SchurSolver(Dw,src,result); std::cout << GridLogMessage << "Testing Compact Wilson Clover" << std::endl; - CompactWilsonCloverFermionR Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + 
CompactWilsonCloverFermionD Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); result=Zero(); SchurSolver(Dw_compact,src,result); std::cout << GridLogMessage << "Testing Wilson Exp Clover" << std::endl; - WilsonExpCloverFermionR Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); + WilsonExpCloverFermionD Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); result=Zero(); SchurSolver(Dwe,src,result); std::cout << GridLogMessage << "Testing Compact Wilson Exp Clover" << std::endl; - CompactWilsonExpCloverFermionR Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + CompactWilsonExpCloverFermionD Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); result=Zero(); SchurSolver(Dwe_compact,src,result); diff --git a/tests/solver/Test_wilsonclover_cg_unprec.cc b/tests/solver/Test_wilsonclover_cg_unprec.cc index 2a859f11..25cf07ee 100644 --- a/tests/solver/Test_wilsonclover_cg_unprec.cc +++ b/tests/solver/Test_wilsonclover_cg_unprec.cc @@ -71,31 +71,31 @@ int main (int argc, char ** argv) RealD csw_r = 1.0; RealD csw_t = 1.0; RealD cF = 1.0; - WilsonCloverFermionR Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); - CompactWilsonCloverFermionR Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); - WilsonExpCloverFermionR Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); - CompactWilsonExpCloverFermionR Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t); + CompactWilsonCloverFermionD Dw_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); + WilsonExpCloverFermionD Dwe(Umu, Grid, RBGrid, mass, csw_r, csw_t); + CompactWilsonExpCloverFermionD Dwe_compact(Umu, Grid, RBGrid, mass, csw_r, csw_t, 0.0); ConjugateGradient CG(1.0e-8,10000); std::cout << GridLogMessage << "Testing Wilson Clover" << std::endl; - MdagMLinearOperator HermOp(Dw); + MdagMLinearOperator HermOp(Dw); result=Zero(); CG(HermOp,src,result); std::cout << GridLogMessage << "Testing Compact Wilson Clover" << std::endl; - MdagMLinearOperator 
HermOp_compact(Dw_compact); + MdagMLinearOperator HermOp_compact(Dw_compact); result=Zero(); CG(HermOp_compact,src,result); std::cout << GridLogMessage << "Testing Wilson Exp Clover" << std::endl; - MdagMLinearOperator HermOp_exp(Dwe); + MdagMLinearOperator HermOp_exp(Dwe); result=Zero(); CG(HermOp_exp,src,result); std::cout << GridLogMessage << "Testing Compact Wilson Exp Clover" << std::endl; - MdagMLinearOperator HermOp_exp_compact(Dwe_compact); + MdagMLinearOperator HermOp_exp_compact(Dwe_compact); result=Zero(); CG(HermOp_exp_compact,src,result); diff --git a/tests/solver/Test_wilsonclover_fcagmres_prec.cc b/tests/solver/Test_wilsonclover_fcagmres_prec.cc index 1a294821..77b2afff 100644 --- a/tests/solver/Test_wilsonclover_fcagmres_prec.cc +++ b/tests/solver/Test_wilsonclover_fcagmres_prec.cc @@ -44,8 +44,8 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); random(pRNG,src); @@ -61,9 +61,9 @@ int main (int argc, char ** argv) RealD mass = 0.5; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); + WilsonCloverFermionD Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); - MdagMLinearOperator HermOp(Dwc); + MdagMLinearOperator HermOp(Dwc); TrivialPrecon simple; diff --git a/tests/solver/Test_wilsonclover_fgmres_prec.cc b/tests/solver/Test_wilsonclover_fgmres_prec.cc index 15bb4136..0f48871f 100644 --- a/tests/solver/Test_wilsonclover_fgmres_prec.cc +++ b/tests/solver/Test_wilsonclover_fgmres_prec.cc @@ -44,8 +44,8 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); 
pRNG.SeedFixedIntegers(seeds); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); random(pRNG,src); @@ -61,9 +61,9 @@ int main (int argc, char ** argv) RealD mass = 0.5; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); + WilsonCloverFermionD Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); - MdagMLinearOperator HermOp(Dwc); + MdagMLinearOperator HermOp(Dwc); TrivialPrecon simple; diff --git a/tests/solver/Test_wilsonclover_gmres_unprec.cc b/tests/solver/Test_wilsonclover_gmres_unprec.cc index 00f33382..b660d716 100644 --- a/tests/solver/Test_wilsonclover_gmres_unprec.cc +++ b/tests/solver/Test_wilsonclover_gmres_unprec.cc @@ -44,8 +44,8 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); random(pRNG,src); @@ -61,9 +61,9 @@ int main (int argc, char ** argv) RealD mass = 0.5; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); + WilsonCloverFermionD Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); - MdagMLinearOperator HermOp(Dwc); + MdagMLinearOperator HermOp(Dwc); GeneralisedMinimalResidual GMRES(1.0e-8, 10000, 25); GMRES(HermOp,src,result); diff --git a/tests/solver/Test_wilsonclover_mg.cc b/tests/solver/Test_wilsonclover_mg.cc index 605d225d..1b0e8bb7 100644 --- a/tests/solver/Test_wilsonclover_mg.cc +++ 
b/tests/solver/Test_wilsonclover_mg.cc @@ -80,16 +80,16 @@ int main(int argc, char **argv) { // Note: We do chiral doubling, so actually only nbasis/2 full basis vectors are used const int nbasis = 40; - WilsonCloverFermionR Dwc(Umu, *FGrid, *FrbGrid, mass, csw_r, csw_t); + WilsonCloverFermionD Dwc(Umu, *FGrid, *FrbGrid, mass, csw_r, csw_t); - MdagMLinearOperator MdagMOpDwc(Dwc); + MdagMLinearOperator MdagMOpDwc(Dwc); std::cout << GridLogMessage << "**************************************************" << std::endl; std::cout << GridLogMessage << "Testing Multigrid for Wilson Clover" << std::endl; std::cout << GridLogMessage << "**************************************************" << std::endl; TrivialPrecon TrivialPrecon; - auto MGPreconDwc = createMGInstance(mgParams, levelInfo, Dwc, Dwc); + auto MGPreconDwc = createMGInstance(mgParams, levelInfo, Dwc, Dwc); MGPreconDwc->setup(); diff --git a/tests/solver/Test_wilsonclover_mr_unprec.cc b/tests/solver/Test_wilsonclover_mr_unprec.cc index ab49ec1f..be721236 100644 --- a/tests/solver/Test_wilsonclover_mr_unprec.cc +++ b/tests/solver/Test_wilsonclover_mr_unprec.cc @@ -44,8 +44,8 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - typedef typename WilsonCloverFermionR::FermionField FermionField; - typename WilsonCloverFermionR::ImplParams params; + typedef typename WilsonCloverFermionD::FermionField FermionField; + typename WilsonCloverFermionD::ImplParams params; WilsonAnisotropyCoefficients anis; FermionField src(&Grid); random(pRNG,src); @@ -61,9 +61,9 @@ int main (int argc, char ** argv) RealD mass = 0.5; RealD csw_r = 1.0; RealD csw_t = 1.0; - WilsonCloverFermionR Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); + WilsonCloverFermionD Dwc(Umu,Grid,RBGrid,mass,csw_r,csw_t,anis,params); - MdagMLinearOperator HermOp(Dwc); + MdagMLinearOperator HermOp(Dwc); MinimalResidual MR(1.0e-8,10000,0.8); MR(HermOp,src,result); diff --git 
a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc index 6b007afc..7f1f98b8 100644 --- a/tests/solver/Test_zmobius_cg_prec.cc +++ b/tests/solver/Test_zmobius_cg_prec.cc @@ -101,7 +101,7 @@ int main(int argc, char** argv) { omegas.push_back( std::complex(0.0686324988446592,-0.0550658530827402) ); #endif - ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,1.,0.); + ZMobiusFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, omegas,1.,0.); LatticeFermion src_o(FrbGrid); LatticeFermion result_o(FrbGrid); @@ -110,7 +110,7 @@ int main(int argc, char** argv) { GridStopWatch CGTimer; - SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO(Ddwf); ConjugateGradient CG(1.0e-8, 10000, 0);// switch off the assert CGTimer.Start(); @@ -121,7 +121,6 @@ int main(int argc, char** argv) { << std::endl; std::cout << GridLogMessage << "######## Dhop calls summary" << std::endl; - Ddwf.Report(); Grid_finalize(); } From a5c77f8b95c0759469c7abd32b6cf80dd3bf72bf Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:40:27 -0500 Subject: [PATCH 190/240] Tracing moved in order --- Grid/GridCore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 8e04a859..41c64ef6 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -44,10 +44,10 @@ Author: paboyle #include #include #include -#include //#include #include #include +#include #include #include #include From bc9579dac6cb5e2babfe7682abb95d5ae7aa5519 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:40:45 -0500 Subject: [PATCH 191/240] Old code path removed --- Grid/algorithms/approx/Chebyshev.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 7c93f0b8..1d6984f3 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -258,26 
+258,12 @@ public: for(int n=2;nView(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); - constexpr int Nsimd = vector_type::Nsimd(); - accelerator_for(ss, in.Grid()->oSites(), Nsimd, { - coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); - coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); - }); - if ( Coeffs[n] != 0.0) { - axpy(out,Coeffs[n],*Tnp,out); - } -#else axpby(y,xscale,mscale,y,(*Tn)); axpby(*Tnp,2.0,-1.0,y,(*Tnm)); if ( Coeffs[n] != 0.0) { axpy(out,Coeffs[n],*Tnp,out); } -#endif + // Cycle pointers to avoid copies Field *swizzle = Tnm; Tnm =Tn; From c0fb20fc039870673e9ca3643fa07ca25ccb808d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:43:12 -0500 Subject: [PATCH 192/240] Audit check for wrongly locked data --- Grid/allocator/MemoryManagerCache.cc | 55 ++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 3bb3db7e..f03ee79f 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -8,9 +8,8 @@ NAMESPACE_BEGIN(Grid); static char print_buffer [ MAXLINE ]; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; -//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout); -#define dprintf(...) - +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +//#define dprintf(...) 
//////////////////////////////////////////////////////////// @@ -132,9 +131,11 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); - assert(AccCache.accLock==0); - assert(AccCache.cpuLock==0); + mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", + (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, + (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); + if (AccCache.accLock!=0) return; + if (AccCache.cpuLock!=0) return; if(AccCache.state==AccDirty) { Flush(AccCache); } @@ -197,6 +198,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -208,6 +210,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -247,11 +250,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %lx %lx : %ld %ld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + 
(uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -286,6 +290,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -298,21 +303,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } @@ -320,6 +325,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // If view is opened on device remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -340,10 +346,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t 
CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -479,6 +487,29 @@ int MemoryManager::isOpen (void* _CpuPtr) return 0; } } +void MemoryManager::Audit(std::string s) +{ + for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){ + auto &AccCache = it->second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + if ( AccCache.cpuLock || AccCache.accLock ) { + std::cout << GridLogError << s<< "\n\t 0x"< Date: Fri, 25 Nov 2022 00:43:57 -0500 Subject: [PATCH 193/240] Audit --- Grid/allocator/MemoryManagerShared.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index c072873b..2434ad47 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,6 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +void MemoryManager::Audit(void){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From 7d8231ba32ce754a3b72cb38af556073f0450e33 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:44:57 -0500 Subject: [PATCH 194/240] Tracing --- Grid/log/Log.cc | 
8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index acccec0e..b54c368d 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -66,6 +66,7 @@ GridLogger GridLogError (1, "Error" , GridLogColours, "RED"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL"); GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL"); +GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); @@ -77,7 +78,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogError.Active(1); GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on - GridLogMemory.Active(0); // at least the messages should be always on + GridLogMemory.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); @@ -87,6 +88,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogHMC.Active(1); for (int i = 0; i < logstreams.size(); i++) { + if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1); if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); @@ -94,8 +96,8 @@ void GridLogConfigure(std::vector &logstreams) { if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); - if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0); - if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); + if (logstreams[i] == 
std::string("NoIntegrator"))GridLogIntegrator.Active(0); + if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); } } From 63a30ae34f9be125c84c887037855a2ccca1b72a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:45:05 -0500 Subject: [PATCH 195/240] Tracing --- Grid/log/Log.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/log/Log.h b/Grid/log/Log.h index fd706771..2d663a3c 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -186,6 +186,7 @@ extern GridLogger GridLogIterative ; extern GridLogger GridLogIntegrator ; extern GridLogger GridLogHMC; extern GridLogger GridLogMemory; +extern GridLogger GridLogTracing; extern Colours GridLogColours; std::string demangle(const char* name) ; From 121c9e2cebbd0b277fd5ca36e6238591a4779613 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:45:21 -0500 Subject: [PATCH 196/240] Tracing --- Grid/perfmon/Tracing.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h index d39350c9..5000cef4 100644 --- a/Grid/perfmon/Tracing.h +++ b/Grid/perfmon/Tracing.h @@ -1,4 +1,7 @@ #pragma once + +NAMESPACE_BEGIN(Grid); + #ifdef GRID_TRACING_NVTX #include class GridTracer { @@ -64,3 +67,4 @@ inline void traceStop(int ID) { } #else #define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name); #endif +NAMESPACE_END(Grid); From d71672dca939ede2459ba4cc25d6a40c3ff9e513 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:46:35 -0500 Subject: [PATCH 197/240] Bug fix --- Grid/qcd/action/fermion/WilsonCompressor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 9ecb1c49..89aeca61 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -61,6 +61,7 @@ public: if(s==0) compress.Compress(buffer[off+i 
],rhs_v[so+table_v[idx].second]); if(s==Ls-1) compress.Compress(buffer[off+i+vol],rhs_v[so+table_v[idx].second]); }); + rhs_v.ViewClose(); } template static void DecompressFace(decompressor decompress,Decompression &dd) @@ -171,6 +172,7 @@ public: compressor &compress, int off,int so,int partial) { + // std::cout << " face gather simple DWF partial "< pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { + // std::cout << " face gather exch DWF partial "< Date: Fri, 25 Nov 2022 00:47:01 -0500 Subject: [PATCH 198/240] Audit --- Grid/allocator/MemoryManager.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 740d8d92..ad2d9ffc 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -36,6 +36,11 @@ NAMESPACE_BEGIN(Grid); #define GRID_ALLOC_SMALL_LIMIT (4096) +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__) +#define AUDIT(a) MemoryManager::Audit(FILE_LINE) + /*Pinning pages is costly*/ //////////////////////////////////////////////////////////////////////////// // Advise the LatticeAccelerator class @@ -94,6 +99,7 @@ private: static void PrintBytes(void); public: + static void Audit(std::string s); static void Init(void); static void InitMessage(void); static void *AcceleratorAllocate(size_t bytes); From f6402cb6c4aa31b33ecfc661befe009efe089165 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 25 Nov 2022 00:50:33 -0500 Subject: [PATCH 199/240] AUDIT removal --- .../WilsonFermion5DImplementation.h | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 992f4d20..4ca24789 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -96,6 +96,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Coordinate block = p.dirichlet; if ( block[0] || block[1] || block[2] || block[3] || block[4] ){ Dirichlet = 1; + std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl; + std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl; Block = block; } } else { @@ -137,9 +139,6 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); - // std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() - // <<" " << StencilEven.surface_list.size()< @@ -148,21 +147,29 @@ void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) GaugeField HUmu(_Umu.Grid()); HUmu = _Umu*(-0.5); if ( Dirichlet ) { + + if ( this->Params.partialDirichlet ) { + std::cout << GridLogMessage << " partialDirichlet BCs " <LocalDimensions()[d]; if (GaugeBlock) assert( (GaugeBlock%ldim)==0); } - } - if ( Dirichlet && (!this->Params.partialDirichlet) ) { - std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " < Filter(GaugeBlock); - Filter.applyFilter(HUmu); - } else { - std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " not filtered gauge field" <Params.partialDirichlet) { + std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " < Filter(GaugeBlock); + Filter.applyFilter(HUmu); + } else { + std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" < Date: Fri, 25 Nov 2022 00:51:04 -0500 Subject: [PATCH 200/240] partial send fix --- Grid/stencil/Stencil.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index da230e7e..cffede12 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -203,7 +203,7 @@ class 
CartesianStencilAccelerator { template class CartesianStencilView : public CartesianStencilAccelerator { - private: +public: int *closed; StencilEntry *cpu_ptr; ViewMode mode; @@ -676,6 +676,8 @@ public: int block = dirichlet_block[dimension]; this->_comms_send[ii] = comm_dim; this->_comms_recv[ii] = comm_dim; + this->_comms_partial_send[ii] = 0; + this->_comms_partial_recv[ii] = 0; if ( block && comm_dim ) { assert(abs(displacement) < ld ); // Quiesce communication across block boundaries @@ -1131,6 +1133,7 @@ public: send_buf = this->u_send_buf_p; // Gather locally, must send assert(send_buf!=NULL); + // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send< Date: Wed, 30 Nov 2022 15:51:13 -0500 Subject: [PATCH 201/240] Partial dirichlet changes --- Grid/log/Log.cc | 1 + Grid/qcd/action/ActionBase.h | 12 ++++--- Grid/qcd/action/fermion/WilsonCompressor.h | 35 +++++++++++++------ .../GeneralEvenOddRationalRatio.h | 1 + .../pseudofermion/TwoFlavourEvenOddRatio.h | 26 +++++++++++++- Grid/qcd/hmc/integrators/Integrator.h | 23 +++++++++++- HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc | 28 +++++++++++---- TODO | 3 ++ benchmarks/Benchmark_dwf_fp32_partial.cc | 12 ++++--- systems/Crusher/config-command | 1 + 10 files changed, 114 insertions(+), 28 deletions(-) diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc index b54c368d..166aea0a 100644 --- a/Grid/log/Log.cc +++ b/Grid/log/Log.cc @@ -79,6 +79,7 @@ void GridLogConfigure(std::vector &logstreams) { GridLogWarning.Active(0); GridLogMessage.Active(1); // at least the messages should be always on GridLogMemory.Active(0); + GridLogTracing.Active(0); GridLogIterative.Active(0); GridLogDebug.Active(0); GridLogPerformance.Active(0); diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 5027fcf4..1e8d6d7a 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -55,14 +55,18 @@ public: deriv_num=0; } void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) { - 
deriv_max_sum+=max; + if ( max > deriv_max_sum ) { + deriv_max_sum=max; + } deriv_norm_sum+=nrm; - Fdt_max_sum+=Fdt_max; + if ( Fdt_max > Fdt_max_sum ) { + Fdt_max_sum=Fdt_max; + } Fdt_norm_sum+=Fdt_nrm; deriv_num++; } - RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; + RealD deriv_max_average(void) { return deriv_max_sum; }; RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; - RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; }; + RealD Fdt_max_average(void) { return Fdt_max_sum; }; RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; }; RealD deriv_timer(void) { return deriv_us; }; RealD S_timer(void) { return S_us; }; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 89aeca61..f62b3aba 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -36,10 +36,12 @@ NAMESPACE_BEGIN(Grid); // Wilson compressor will need FaceGather policies for: // Periodic, Dirichlet, and partial Dirichlet for DWF /////////////////////////////////////////////////////////////// +const int dwf_compressor_depth=2; class FaceGatherPartialDWF { public: - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; + static int PartialCompressionFactor(GridBase *grid) {return 1;} // static int PartialCompressionFactor(GridBase *grid) { return 1;} template static void Gather_plane_simple (commVector >& table, @@ -52,14 +54,17 @@ public: // Shrinks local and remote comms buffers GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; + // int depth=dwf_compressor_depth; + int depth=Ls/2; std::pair *table_v = & table[0]; auto rhs_v = rhs.View(AcceleratorRead); int vol=table.size()/Ls; accelerator_forNB( idx,table.size(), vobj::Nsimd(), { Integer i=idx/Ls; Integer s=idx%Ls; - if(s==0) 
compress.Compress(buffer[off+i ],rhs_v[so+table_v[idx].second]); - if(s==Ls-1) compress.Compress(buffer[off+i+vol],rhs_v[so+table_v[idx].second]); + Integer sc=depth+s-(Ls-depth); + if(s=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]); }); rhs_v.ViewClose(); } @@ -67,6 +72,8 @@ public: static void DecompressFace(decompressor decompress,Decompression &dd) { auto Ls = dd.dims[0]; + // int depth=dwf_compressor_depth; + int depth=Ls/2; // Just pass in the Grid auto kp = dd.kernel_p; auto mp = dd.mpi_p; @@ -75,11 +82,12 @@ public: accelerator_forNB(o,size,1,{ int idx=o/Ls; int s=o%Ls; - if ( s == 0 ) { - int oo=idx; + if ( s < depth ) { + int oo=s*vol+idx; kp[o]=mp[oo]; - } else if ( s == Ls-1 ) { - int oo=vol+idx; + } else if ( s >= Ls-depth ) { + int sc = depth + s - (Ls-depth); + int oo=sc*vol+idx; kp[o]=mp[oo]; } else { kp[o] = Zero();//fill rest with zero if partial dirichlet @@ -98,7 +106,9 @@ public: { GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; - + // int depth=dwf_compressor_depth; + int depth = Ls/2; + // insertion of zeroes... assert( (table.size()&0x1)==0); int num=table.size()/2; @@ -113,7 +123,7 @@ public: // Reorders both local and remote comms buffers // int s = j % Ls; - int sp1 = (s+1)%Ls; // peri incremented s slice + int sp1 = (s+depth)%Ls; // peri incremented s slice int hxyz= j/Ls; @@ -136,6 +146,8 @@ public: static void MergeFace(decompressor decompress,Merger &mm) { auto Ls = mm.dims[0]; + int depth = Ls/2; + // int depth=dwf_compressor_depth; int num= mm.buffer_size/2; // relate vol and Ls to buffer size auto mp = &mm.mpointer[0]; auto vp0= &mm.vpointers[0][0]; // First arg is exchange first @@ -149,7 +161,7 @@ public: int xyz0=hxyz*2; int xyz1=hxyz*2+1; - int sp = (s+1)%Ls; + int sp = (s+depth)%Ls; int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... 
int oo0= s+xyz0*Ls; @@ -163,7 +175,8 @@ public: class FaceGatherDWFMixedBCs { public: - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; + static int PartialCompressionFactor(GridBase *grid) {return 1;} template static void Gather_plane_simple (commVector >& table, diff --git a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h index ff605362..cb680f2f 100644 --- a/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/GeneralEvenOddRationalRatio.h @@ -279,6 +279,7 @@ NAMESPACE_BEGIN(Grid); // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi ////////////////////////////////////////////////////// virtual RealD Sinitial(const GaugeField &U) { + std::cout << GridLogMessage << "Returning stored two flavour refresh action "< { public: INHERIT_IMPL_TYPES(Impl); - + private: FermionOperator & NumOp;// the basic operator FermionOperator & DenOp;// the basic operator @@ -112,28 +112,48 @@ NAMESPACE_BEGIN(Grid); // NumOp == V // DenOp == M // + AUDIT(); FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); + AUDIT(); pickCheckerboard(Even,etaEven,eta); + AUDIT(); pickCheckerboard(Odd,etaOdd,eta); + AUDIT(); NumOp.ImportGauge(U); + AUDIT(); DenOp.ImportGauge(U); + std::cout << " TwoFlavourRefresh: Imported gauge "< Mpc(DenOp); + AUDIT(); SchurDifferentiableOperator Vpc(NumOp); + AUDIT(); + std::cout << " TwoFlavourRefresh: Diff ops "<is_smeared); double start_force = usecond(); + + std::cout << GridLogMessage << "AuditForce["<deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); + std::cout << 
GridLogMessage << "AuditForce["<is_smeared << std::endl; auto name = as[level].actions.at(a)->action_name(); if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); @@ -284,7 +291,7 @@ public: for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { std::cout << GridLogMessage << as[level].actions.at(actionID)->action_name() - <<"["<deriv_max_average() <<" norm " << as[level].actions.at(actionID)->deriv_norm_average() <<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average() @@ -364,9 +371,14 @@ public: std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<is_smeared); + + std::cout << GridLogMessage << "AuditRefresh["<refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); as[level].actions.at(actionID)->refresh_timer_stop(); + std::cout << GridLogMessage << "AuditRefresh["<is_smeared); @@ -412,6 +425,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; + AUDIT(); } as[level].apply(S_hireps, Representations, level, H); } @@ -424,7 +438,9 @@ public: void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { for (int a = 0; a < repr_set.size(); ++a) { + AUDIT(); RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); + AUDIT(); std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; H += Hterm; @@ -449,8 +465,10 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; as[level].actions.at(actionID)->S_timer_start(); + AUDIT(); Hterm = as[level].actions.at(actionID)->Sinitial(Us); as[level].actions.at(actionID)->S_timer_stop(); + AUDIT(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } @@ -463,6 +481,7 @@ 
public: void integrate(Field& U) { + AUDIT(); // reset the clocks t_U = 0; for (int level = 0; level < as.size(); ++level) { @@ -480,8 +499,10 @@ public: assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + AUDIT(); FieldImplementation::Project(U); + AUDIT(); // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc index f4ca8515..e85bec7e 100644 --- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc @@ -179,8 +179,11 @@ int main(int argc, char **argv) { MD.name = std::string("Force Gradient"); //typedef GenericHMCRunner HMCWrapper; // MD.name = std::string("MinimumNorm2"); - // MD.MDsteps = 4; - MD.MDsteps = 4; + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => ?? dH + //MD.MDsteps = 4; + MD.MDsteps = 3; MD.trajL = 0.5; HMCparameters HMCparams; @@ -223,7 +226,7 @@ int main(int argc, char **argv) { Real light_mass = 7.8e-4; Real strange_mass = 0.0362; Real pv_mass = 1.0; - std::vector hasenbusch({ light_mass, 3.8e-3, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); @@ -327,6 +330,8 @@ int main(int argc, char **argv) { ParamsF.dirichlet=NonDirichlet; ParamsDir.dirichlet=Dirichlet; ParamsDirF.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=1; + ParamsDirF.partialDirichlet=1; // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; @@ -342,8 +347,8 @@ int main(int argc, char **argv) { // Collect actions //////////////////////////////////// ActionLevel Level1(1); - ActionLevel Level2(2); - ActionLevel Level3(30); + ActionLevel Level2(3); + ActionLevel Level3(15); //////////////////////////////////// 
// Strange action @@ -474,13 +479,21 @@ int main(int argc, char **argv) { if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; else ParamsDen.dirichlet = NonDirichlet; + if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; + else ParamsNum.partialDirichlet = 0; + + if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; + else ParamsDen.partialDirichlet = 0; + Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); ParamsDenF.dirichlet = ParamsDen.dirichlet; + ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); ParamsNumF.dirichlet = ParamsNum.dirichlet; + ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); LinOpD.push_back(new LinearOperatorD(*Denominators[h])); @@ -516,9 +529,11 @@ int main(int argc, char **argv) { FermionActionD2::ImplParams ParamsNumD2(boundary); ParamsDenD2.dirichlet = ParamsDen.dirichlet; + ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet; DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2)); ParamsNumD2.dirichlet = ParamsNum.dirichlet; + ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet; NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2)); Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( @@ -543,7 +558,8 @@ int main(int argc, char **argv) { int nquo=Quotients.size(); Level1.push_back(Bdys[0]); Level1.push_back(Bdys[1]); - for(int h=0;h HIP and SYCL GPU code + + ====== DDHMC ====== diff --git 
a/benchmarks/Benchmark_dwf_fp32_partial.cc b/benchmarks/Benchmark_dwf_fp32_partial.cc index 4db3022a..0cbd4a6c 100644 --- a/benchmarks/Benchmark_dwf_fp32_partial.cc +++ b/benchmarks/Benchmark_dwf_fp32_partial.cc @@ -88,6 +88,7 @@ int main (int argc, char ** argv) // Node level ////////////////////// for(int d=0;d1 ? 1 : 0; + // for(int d=0;doSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; @@ -246,7 +247,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) autoView( src_v, src , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; @@ -342,6 +343,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) ref = Zero(); for(int mu=0;muoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; @@ -365,7 +367,7 @@ void Benchmark(int Ls, Coordinate Dirichlet, int partial) autoView( src_v, src , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s=Ls-depth)){ tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s]; } else { tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index bd825ab3..3965767f 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -3,6 +3,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` --with-lime=$CLIME \ --enable-unified=no \ --enable-shm=nvlink \ +--enable-tracing=timer \ --enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ From 82c1ecf60f8128c6399fcc86217e086aba934146 Mon Sep 17 00:00:00 2001 From: Chulwoo Jung Date: Wed, 30 Nov 2022 16:08:40 -0500 Subject: [PATCH 202/240] Block lanczos added --- .../ImplicitlyRestartedBlockLanczos.h | 1555 +++++++++++++++++ Grid/lattice/Lattice_rng.h | 10 + Grid/util/Init.cc | 3 
+- Grid/util/Init.h | 2 +- tests/lanczos/Test_dwf_block_lanczos.README | 73 + tests/lanczos/Test_dwf_block_lanczos.cc | 408 +++++ .../lanczos/Test_dwf_block_lanczos.cc.double | 401 +++++ .../lanczos/Test_dwf_block_lanczos.cc.single | 408 +++++ tests/lanczos/Test_dwf_lanczos.cc | 93 +- 9 files changed, 2924 insertions(+), 29 deletions(-) create mode 100644 Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h create mode 100644 tests/lanczos/Test_dwf_block_lanczos.README create mode 100644 tests/lanczos/Test_dwf_block_lanczos.cc create mode 100644 tests/lanczos/Test_dwf_block_lanczos.cc.double create mode 100644 tests/lanczos/Test_dwf_block_lanczos.cc.single diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h new file mode 100644 index 00000000..fe388c19 --- /dev/null +++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h @@ -0,0 +1,1555 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: Chulwoo Jung +Author: Yong-Chull Jang +Author: Guido Cossu + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_IRBL_H +#define GRID_IRBL_H + +#include //memset +#ifdef USE_LAPACK +#include +#endif + +#undef USE_LAPACK +#define Glog std::cout << GridLogMessage + +#ifdef GRID_CUDA +#include "cublas_v2.h" +#endif + +#if 0 +#define CUDA_COMPLEX cuDoubleComplex +#define CUDA_FLOAT double +#define MAKE_CUDA_COMPLEX make_cuDoubleComplex +#define CUDA_GEMM cublasZgemm +#else +#define CUDA_COMPLEX cuComplex +#define CUDA_FLOAT float +#define MAKE_CUDA_COMPLEX make_cuComplex +#define CUDA_GEMM cublasCgemm +#endif + +namespace Grid { + +//////////////////////////////////////////////////////////////////////////////// +// Helper class for sorting the evalues AND evectors by Field +// Use pointer swizzle on vectors SHOULD GET RID OF IT SOON! +//////////////////////////////////////////////////////////////////////////////// +template +class SortEigen { + private: + static bool less_lmd(RealD left,RealD right){ + return left > right; + } + static bool less_pair(std::pair& left, + std::pair& right){ + return left.first > (right.first); + } + + public: + void push(std::vector& lmd,std::vector& evec,int N) { + + //////////////////////////////////////////////////////////////////////// + // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set. 
+ // : The vector reorder should be done by pointer swizzle somehow + //////////////////////////////////////////////////////////////////////// + std::vector cpy(lmd.size(),evec[0].Grid()); + for(int i=0;i > emod(lmd.size()); + + for(int i=0;i(lmd[i],&cpy[i]); + + partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); + + typename std::vector >::iterator it = emod.begin(); + for(int i=0;ifirst; + evec[i]=*(it->second); + ++it; + } + } + void push(std::vector& lmd,int N) { + std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); + } + bool saturated(RealD lmd, RealD thrs) { + return fabs(lmd) > fabs(thrs); + } +}; + +enum class LanczosType { irbl, rbl }; + +enum IRBLdiagonalisation { + IRBLdiagonaliseWithDSTEGR, + IRBLdiagonaliseWithQR, + IRBLdiagonaliseWithEigen +}; + +///////////////////////////////////////////////////////////// +// Implicitly restarted block lanczos +///////////////////////////////////////////////////////////// +template +class ImplicitlyRestartedBlockLanczos { + +private: + + std::string cname = std::string("ImplicitlyRestartedBlockLanczos"); + int MaxIter; // Max iterations + int Nstop; // Number of evecs checked for convergence + int Nu; // Number of vecs in the unit block + int Nk; // Number of converged sought + int Nm; // total number of vectors + int Nblock_k; // Nk/Nu + int Nblock_m; // Nm/Nu + int Nconv_test_interval; // Number of skipped vectors when checking a convergence + RealD eresid; + IRBLdiagonalisation diagonalisation; + //////////////////////////////////// + // Embedded objects + //////////////////////////////////// + SortEigen _sort; + LinearOperatorBase &_Linop; + LinearOperatorBase &_SLinop;//for split + OperatorFunction &_poly; + GridRedBlackCartesian * f_grid; + GridRedBlackCartesian * sf_grid; + int mrhs; + ///////////////////////// + // BLAS objects + ///////////////////////// +#ifdef GRID_CUDA + cudaError_t cudaStat; + CUDA_COMPLEX *w_acc, *evec_acc, *c_acc; +#endif + int Nevec_acc; // Number of 
eigenvectors stored in the buffer evec_acc + + ///////////////////////// + // Constructor + ///////////////////////// +public: + int split_test; //test split in the first iteration + ImplicitlyRestartedBlockLanczos(LinearOperatorBase &Linop, // op + LinearOperatorBase &SLinop, // op + GridRedBlackCartesian * FrbGrid, + GridRedBlackCartesian * SFrbGrid, + int _mrhs, + OperatorFunction & poly, // polynomial + int _Nstop, // really sought vecs + int _Nconv_test_interval, // conv check interval + int _Nu, // vecs in the unit block + int _Nk, // sought vecs + int _Nm, // total vecs + RealD _eresid, // resid in lmd deficit + int _MaxIter, // Max iterations + IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen) + : _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid), + Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs), + Nu(_Nu), Nk(_Nk), Nm(_Nm), + Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu), + //eresid(_eresid), MaxIter(10), + eresid(_eresid), MaxIter(_MaxIter), + diagonalisation(_diagonalisation),split_test(0), + Nevec_acc(_Nu) + { assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; + + //////////////////////////////// + // Helpers + //////////////////////////////// + static RealD normalize(Field& v, int if_print=0) + { + RealD nn = norm2(v); + nn = sqrt(nn); +#if 0 + if(if_print && nn < 1e20) + Glog<<"normalize: "<& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; +// MyComplex ip; + ComplexD ip; + + for(int j=0; j 1e-14) + Glog<<"orthogonalize before: "< 1e-14) + Glog<<"orthogonalize after: "<& evec, int k) + { + orthogonalize(w, evec, k,1); + } + + void orthogonalize(std::vector& w, int _Nu, std::vector& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; +// ComplexD ip; + + for(int j=0; j 1e-14) + Glog<<"orthogonalize before: "< 1e-14) + Glog<<"orthogonalize after: "<& w, std::vector& evec, int R, int do_print=0) + { +#ifdef GRID_CUDA + Glog << 
"cuBLAS orthogonalize" << std::endl; + + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef typename Field::scalar_type MyComplex; + + GridBase *grid = w[0].Grid(); + const uint64_t sites = grid->lSites(); + + int Nbatch = R/Nevec_acc; + assert( R%Nevec_acc == 0 ); +// Glog << "nBatch, Nevec_acc, R, Nu = " +// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; + +#if 0 // a trivial test + for (int col=0; col(&w_v[0]); +// Glog << "col= "<(&evec_v[0]); +// Glog << "col= "<[" << j << "," << i << "] = " + << z.x << " + i " << z.y << std::endl; + } + w[i] = w[i] - ip * evec[b*Nevec_acc+j]; + } + } +#else + alpha = MAKE_CUDA_COMPLEX(-1.0,0.0); + beta = MAKE_CUDA_COMPLEX(1.0,0.0); + stat = CUDA_GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc, + &alpha, + evec_acc, 12*sites, c_acc, Nevec_acc, + &beta, + w_acc, 12*sites); + //Glog << stat << std::endl; +#endif + } +#if 1 + for (int col=0; col(&w_v[0]); + for (size_t row=0; row &inner, std::vector& lhs, int llhs, std::vector& rhs, int lrhs) +{ + typedef typename Field:vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + GridBase *grid = lhs[0]._grid; + assert(grid == rhs[0]._grid; + const int pad = 8; + int total = llhs*lrhs; + assert(inner.size()==total); + int sum_size=grid->SumArraySize(); + +// std::vector inner(total); + Vector sumarray(sum_size*pad*total); + + parallel_for(int thr=0;throSites(),thr,mywork,myoff); + + std::vector< decltype(innerProductD(lhs[0]._odata[0],rhs[0]._odata[0])) > vinner(total,zero); // private to thread; sub summation + for(int ss=myoff;ssGlobalSum(tmp); + inner[i]=tmp; + } +// return inner; +} +#endif + + + void orthogonalize_blockhead(Field& w, std::vector& evec, int k, int Nu) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j& eval, + std::vector& 
evec, + const std::vector& src, int& Nconv, LanczosType Impl) + { +#ifdef GRID_CUDA + GridBase *grid = src[0].Grid(); + grid->show_decomposition(); + +// printf("GRID_CUDA\n"); + + // set eigenvector buffers for the cuBLAS calls + //const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->lSites(); + + cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(CUDA_COMPLEX)); +// Glog << "w_acc= "<& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_irbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu = src.size() ); + + Glog << std::string(74,'*') << std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nm); + std::vector resid(Nk); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); + std::vector B(Nm,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + + RealD 
beta_k; + + // set initial vector + for (int i=0; i& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_rbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu = src.size() ); + + int Np = (Nm-Nk); + if (Np > 0 && MaxIter > 1) Np /= MaxIter; + int Nblock_p = Np/Nu; + for(int i=0;i< evec.size();i++) evec[0].Advise()=AdviseInfrequentUse; + + Glog << std::string(74,'*') << std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek (min) Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- seek (inc) Np = "<< Np <<" vectors"<< std::endl; + Glog <<" -- seek (max) Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nk); + std::vector resid(Nm); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); +// int Ntest=Nu; +// std::vector B(Nm,grid); // waste of space replicating + std::vector B(1,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + +// RealD beta_k; + + // set initial vector + for (int i=0; i 
Btmp(Nstop,grid); // waste of space replicating +#if 0 + for(int i=0; i>& lmd, + std::vector>& lme, + std::vector& evec, + std::vector& w, + std::vector& w_copy, + int b) + { + const RealD tiny = 1.0e-20; + + int Nu = w.size(); + int Nm = evec.size(); + assert( b < Nm/Nu ); +// GridCartesian *grid = evec[0]._grid; + + // converts block index to full indicies for an interval [L,R) + int L = Nu*b; + int R = Nu*(b+1); + + Real beta; + + Glog << "Using split grid"<< std::endl; +// LatticeGaugeField s_Umu(SGrid); + assert((Nu%mrhs)==0); + std::vector in(mrhs,f_grid); + + Field s_in(sf_grid); + Field s_out(sf_grid); + // unnecessary copy. Can or should it be avoided? + int k_start = 0; +while ( k_start < Nu) { + Glog << "k_start= "<0) { + for (int u=0; u0) { + for (int u=0; u0) { + // orthogonalize_blockhead(w[0],evec,b,Nu); + // for (int u=1; u& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); + + for ( int u=0; u eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + eval[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i); + //Qt(Nk-1-i,j) = eigensolver.eigenvectors()(i,j); + //Qt(i,j) = eigensolver.eigenvectors()(i,j); + } + } + } + +#ifdef USE_LAPACK + void diagonalize_lapack(std::vector& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + Glog << "diagonalize_lapack: Nu= "<_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + MKL_INT il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + Glog << "node "<= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for 
(int j = 0; j< NN; j++){ + evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j]; + if (il>1) { + evec_tmp[(i-(il-1))*NN+j].imag=0.; + evec_tmp[(i-(il-1))*NN+j].real=0.; + } + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,2*NN*NN); + } + } + // Safer to sort instead of just reversing it, + // but the document of the routine says evals are sorted in increasing order. + // qr gives evals in decreasing order. +// for(int i=0;i + ( evec_tmp[i*Nk+j].real, + evec_tmp[i*Nk+j].imag); +// ( evec_tmp[(Nk-1-j)*Nk+Nk-1-i].real, +// evec_tmp[(Nk-1-j)*Nk+Nk-1-i].imag); + + } + } + +if (1){ + Eigen::SelfAdjointEigenSolver eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + Glog << "eval = "<& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, + GridBase *grid) + { + Qt = Eigen::MatrixXcd::Identity(Nm,Nm); + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#endif + } else { + assert(0); + } + } + + + void unpackHermitBlockTriDiagMatToEigen( + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + M = Eigen::MatrixXcd::Zero(Nk,Nk); + + // rearrange + for ( int u=0; u>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + + // rearrange + for ( int u=0; u QRD(Mtmp); + Q = QRD.householderQ(); + R = QRD.matrixQR(); // upper triangular part is the R matrix. + // lower triangular part used to represent series + // of Q sequence. 
+ + // equivalent operation of Qprod *= Q + //M = Eigen::MatrixXcd::Zero(Nm,Nm); + + //for (int i=0; i Nm) kmax = Nm; + for (int k=i; ki) M(i,j) = conj(M(j,i)); + // if (i-j > Nu || j-i > Nu) M(i,j) = 0.; + // } + //} + + //Glog << "shiftedQRDecompEigen() end" << endl; + } + + void exampleQRDecompEigen(void) + { + Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd Q = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd P = Eigen::MatrixXd::Zero(3,3); + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::ColPivHouseholderQR QRD(A); + + Glog << "matrix A after ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = nonzeroPiviots" << std::endl; + Q = QRD.householderQ().setLength(QRD.nonzeroPivots()); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = 1" << std::endl; + Q = QRD.householderQ().setLength(1); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence lenth = 2" << std::endl; + Q = QRD.householderQ().setLength(2); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrixR" << std::endl; + R = QRD.matrixR(); + for ( int i=0; 
i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "R[" << i << "," << j << "] = " << R(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "rank = " << QRD.rank() << std::endl; + Glog << "threshold = " << QRD.threshold() << std::endl; + + Glog << "matrixP" << std::endl; + P = QRD.colsPermutation(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "P[" << i << "," << j << "] = " << P(i,j) << '\n'; + } + } + Glog << std::endl; + + + Glog << "QR decomposition without column pivoting" << std::endl; + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::HouseholderQR QRDplain(A); + + Glog << "HouseholderQ" << std::endl; + Q = QRDplain.householderQ(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrix A after Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + } + + }; +} +#undef Glog +#undef USE_LAPACK +#undef CUDA_COMPLEX +#undef CUDA_FLOAT +#undef MAKE_CUDA_COMPLEX +#undef CUDA_GEMM +#endif diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 6857dc84..180b8437 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -440,7 +440,17 @@ public: _grid->GlobalCoorToGlobalIndex(gcoor,gidx); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); +#if 1 assert(rank == _grid->ThisRank() ); +#else +// + if (rank != _grid->ThisRank() ){ + std::cout <<"rank "<ThisRank() "<<_grid->ThisRank()<< std::endl; +// exit(-42); +// assert(0); + } +#endif + int 
l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index c3ac2424..d013763a 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val) return; } -void GridCmdOptionFloat(std::string &str,float & val) +void GridCmdOptionFloat(std::string &str,double & val) { std::stringstream ss(str); ss>>val; return; } - void GridParseLayout(char **argv,int argc, Coordinate &latt_c, Coordinate &mpi_c) diff --git a/Grid/util/Init.h b/Grid/util/Init.h index 585660a1..bdf0bcac 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector & vec); template void GridCmdOptionIntVector(const std::string &str,VectorInt & vec); void GridCmdOptionInt(std::string &str,int & val); -void GridCmdOptionFloat(std::string &str,float & val); +void GridCmdOptionFloat(std::string &str,double & val); void GridParseLayout(char **argv,int argc, diff --git a/tests/lanczos/Test_dwf_block_lanczos.README b/tests/lanczos/Test_dwf_block_lanczos.README new file mode 100644 index 00000000..179f9037 --- /dev/null +++ b/tests/lanczos/Test_dwf_block_lanczos.README @@ -0,0 +1,73 @@ +#Example script +DIR=/gpfs/alpine/phy157/proj-shared/phy157dwf/chulwoo/Grid/BL/build/tests/lanczos +BIN=${DIR}/Test_dwf_block_lanczos + +VOL='--grid 16.16.16.32 ' +GRID='--mpi 1.1.1.4 ' +CONF='--gconf ckpoint_lat.IEEE64BIG.2000 ' +OPT='--mass 0.01 --M5 1.8 --phase in.params --omega in.params --shm 4096' +#BL='--rbl 16.1024.128.1000.10 --split 1.1.4.4 --check_int 100 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51' +BL='--rbl 4.128.16.100.10 --split 1.1.1.4 --check_int 25 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51' + +ARGS=${CONF}" "${OPT}" "${BL}" "${VOL}" "${GRID} +export APP="${BIN} ${ARGS}" +echo APP=${APP} +#export JS="jsrun --nrs 32 -a4 -g4 -c42 -dpacked -b packed:7 --smpiargs="-gpu" " +export JS="jsrun --nrs 1 -a4 -g4 
-c42 -dpacked -b packed:10 --smpiargs="-gpu" " +$JS $APP + +#sample in.param + +boundary_phase 0 1 0 +boundary_phase 1 1 0 +boundary_phase 2 1 0 +boundary_phase 3 -1 0 + +omega 0 0.5 0 +omega 1 0.5 0 +omega 2 0.5 0 +omega 3 0.5 0 +omega 4 0.5 0 +omega 5 0.5 0 +omega 6 0.5 0 +omega 7 0.5 0 +omega 8 0.5 0 +omega 9 0.5 0 +omega 10 0.5 0 +omega 11 0.5 0 + + +#output + +Grid : Message : 1.717474 s : Gauge Configuration ckpoint_lat.IEEE64BIG.2000 +Grid : Message : 1.717478 s : boundary_phase[0] = (1,0) +Grid : Message : 1.717497 s : boundary_phase[1] = (1,0) +Grid : Message : 1.717500 s : boundary_phase[2] = (1,0) +Grid : Message : 1.717503 s : boundary_phase[3] = (-1,0) +Grid : Message : 1.717506 s : Ls 12 +Grid : Message : 1.717507 s : mass 0.01 +Grid : Message : 1.717510 s : M5 1.8 +Grid : Message : 1.717512 s : mob_b 1.5 +Grid : Message : 1.717514 s : omega[0] = (0.5,0) +Grid : Message : 1.717517 s : omega[1] = (0.5,0) +Grid : Message : 1.717520 s : omega[2] = (0.5,0) +Grid : Message : 1.717523 s : omega[3] = (0.5,0) +Grid : Message : 1.717526 s : omega[4] = (0.5,0) +Grid : Message : 1.717529 s : omega[5] = (0.5,0) +Grid : Message : 1.717532 s : omega[6] = (0.5,0) +Grid : Message : 1.717535 s : omega[7] = (0.5,0) +Grid : Message : 1.717538 s : omega[8] = (0.5,0) +Grid : Message : 1.717541 s : omega[9] = (0.5,0) +Grid : Message : 1.717544 s : omega[10] = (0.5,0) +Grid : Message : 1.717547 s : omega[11] = (0.5,0) +Grid : Message : 1.717550 s : Nu 4 +Grid : Message : 1.717551 s : Nk 128 +Grid : Message : 1.717552 s : Np 16 +Grid : Message : 1.717553 s : Nm 288 +Grid : Message : 1.717554 s : Nstop 100 +Grid : Message : 1.717555 s : Ntest 25 +Grid : Message : 1.717557 s : MaxIter 10 +Grid : Message : 1.717558 s : resid 1e-05 +Grid : Message : 1.717560 s : Cheby Poly 0.007,7,51 + + diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc b/tests/lanczos/Test_dwf_block_lanczos.cc new file mode 100644 index 00000000..7449e32a --- /dev/null +++ 
b/tests/lanczos/Test_dwf_block_lanczos.cc @@ -0,0 +1,408 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_block_lanczos.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; +//using namespace Grid::QCD; + +//typedef typename GparityDomainWallFermionR::FermionField FermionField; +typedef typename ZMobiusFermionF::FermionField FermionField; + +RealD AllZero(RealD x){ return 0.;} + +class CmdJobParams +{ + public: + std::string gaugefile; + + int Ls; + double mass; + double M5; + double mob_b; + std::vector omega; + std::vector boundary_phase; + std::vector mpi_split; + + LanczosType Impl; + int Nu; + int Nk; + int Np; + int Nm; + int Nstop; + int Ntest; + int MaxIter; + double resid; + + double low; + double high; + int order; + + CmdJobParams() + : gaugefile("Hot"), + Ls(8), mass(0.01), M5(1.8), mob_b(1.5), + Impl(LanczosType::irbl),mpi_split(4,1), + Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8), + low(0.2), high(5.5), order(11) + {Nm=Nk+Np;}; + + void Parse(char **argv, int argc); +}; + + +void CmdJobParams::Parse(char **argv,int argc) +{ + std::string arg; + std::vector vi; + double re,im; + int expect, idx; + std::string vstr; + std::ifstream pfile; + + if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){ + gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf"); + } + + if( GridCmdOptionExists(argv,argv+argc,"--phase") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--phase"); + pfile.open(arg); + assert(pfile); + expect = 0; + while( pfile >> vstr ) { + if ( vstr.compare("boundary_phase") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(expect==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + boundary_phase.push_back({re,im}); + expect++; + } + } + pfile.close(); + } else { + for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.}); + } + + if( 
GridCmdOptionExists(argv,argv+argc,"--omega") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--omega"); + pfile.open(arg); + assert(pfile); + Ls = 0; + while( pfile >> vstr ) { + if ( vstr.compare("omega") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(Ls==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + omega.push_back({re,im}); + Ls++; + } + } + pfile.close(); + } else { + if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--Ls"); + GridCmdOptionInt(arg,Ls); + } + } + + if( GridCmdOptionExists(argv,argv+argc,"--mass") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mass"); + GridCmdOptionFloat(arg,mass); + } + + if( GridCmdOptionExists(argv,argv+argc,"--M5") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--M5"); + GridCmdOptionFloat(arg,M5); + } + + if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b"); + GridCmdOptionFloat(arg,mob_b); + } + + if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--irbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. + Impl = LanczosType::irbl; + Nm = Nk+Np; + } + + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--rbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; // vector space is enlarged by adding Np vectors + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. 
+ Impl = LanczosType::rbl; + Nm = Nk+Np*MaxIter; + } + +#if 1 + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--split") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--split"); + GridCmdOptionIntVector(arg,vi); + for(int i=0;i seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + // ypj [note] why seed RNG5 again? bug? In this case, run with a default seed(). + GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5); + + LatticeGaugeField Umu(UGrid); + std::vector U(4,UGrid); + LatticeGaugeFieldF UmuF(UGridF); + std::vector UF(4,UGridF); + + if ( JP.gaugefile.compare("Hot") == 0 ) { + SU3::HotConfiguration(RNG4, Umu); + } else { + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,JP.gaugefile); + // ypj [fixme] additional checks for the loaded configuration? + } + precisionChange (UmuF,Umu); + + for(int mu=0;mu(Umu,mu); + } + + RealD mass = JP.mass; + RealD M5 = JP.M5; + +// ypj [fixme] flexible support for a various Fermions +// RealD mob_b = JP.mob_b; // Gparity +// std::vector omega; // ZMobius + +// GparityMobiusFermionD ::ImplParams params; +// std::vector twists({1,1,1,0}); +// params.twists = twists; +// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); +// SchurDiagTwoOperator HermOp(Ddwf); + + +// int mrhs = JP.Nu; + int Ndir=4; + auto mpi_layout = GridDefaultMpi(); + std::vector mpi_split (Ndir,1); +#if 0 + int tmp=mrhs, dir=0; + std::cout << GridLogMessage << "dir= "<_processor,re); + src_tmp=re; + pickCheckerboard(Odd,src[i],src_tmp); + } + RNG5.Report(); +} else { + std::cout << GridLogMessage << "Using RNG5rb"< evec(JP.Nm,FrbGridF); + for(int i=0;i<1;++i){ + std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl; + }; + + int Nconv; + 
IRBL.calc(eval,evec,src,Nconv,JP.Impl); + + + Grid_finalize(); +} diff --git a/tests/lanczos/Test_dwf_block_lanczos.cc.double b/tests/lanczos/Test_dwf_block_lanczos.cc.double new file mode 100644 index 00000000..c71b80ec --- /dev/null +++ b/tests/lanczos/Test_dwf_block_lanczos.cc.double @@ -0,0 +1,401 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_block_lanczos.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; +//using namespace Grid::QCD; + +//typedef typename GparityDomainWallFermionR::FermionField FermionField; +typedef typename ZMobiusFermionR::FermionField FermionField; + +RealD AllZero(RealD x){ return 0.;} + +class CmdJobParams +{ + public: + std::string gaugefile; + + int Ls; + double mass; + double M5; + double mob_b; + std::vector omega; + std::vector boundary_phase; + std::vector mpi_split; + + LanczosType Impl; + int Nu; + int Nk; + int Np; + int Nm; + int Nstop; + int Ntest; + int MaxIter; + double resid; + + double low; + double high; + int order; + + CmdJobParams() + : gaugefile("Hot"), + Ls(8), mass(0.01), M5(1.8), mob_b(1.5), + Impl(LanczosType::irbl),mpi_split(4,1), + Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8), + low(0.2), high(5.5), order(11) + {Nm=Nk+Np;}; + + void Parse(char **argv, int argc); +}; + + +void CmdJobParams::Parse(char **argv,int argc) +{ + std::string arg; + std::vector vi; + double re,im; + int expect, idx; + std::string vstr; + std::ifstream pfile; + + if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){ + gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf"); + } + + if( GridCmdOptionExists(argv,argv+argc,"--phase") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--phase"); + pfile.open(arg); + assert(pfile); + expect = 0; + while( pfile >> vstr ) { + if ( vstr.compare("boundary_phase") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(expect==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + boundary_phase.push_back({re,im}); + expect++; + } + } + pfile.close(); + } else { + for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.}); + } + + if( 
GridCmdOptionExists(argv,argv+argc,"--omega") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--omega"); + pfile.open(arg); + assert(pfile); + Ls = 0; + while( pfile >> vstr ) { + if ( vstr.compare("omega") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(Ls==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + omega.push_back({re,im}); + Ls++; + } + } + pfile.close(); + } else { + if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--Ls"); + GridCmdOptionInt(arg,Ls); + } + } + + if( GridCmdOptionExists(argv,argv+argc,"--mass") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mass"); + GridCmdOptionFloat(arg,mass); + } + + if( GridCmdOptionExists(argv,argv+argc,"--M5") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--M5"); + GridCmdOptionFloat(arg,M5); + } + + if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b"); + GridCmdOptionFloat(arg,mob_b); + } + + if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--irbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. + Impl = LanczosType::irbl; + Nm = Nk+Np; + } + + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--rbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; // vector space is enlarged by adding Np vectors + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. 
+ Impl = LanczosType::rbl; + Nm = Nk+Np*MaxIter; + } + +#if 1 + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--split") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--split"); + GridCmdOptionIntVector(arg,vi); + for(int i=0;i seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + // ypj [note] why seed RNG5 again? bug? In this case, run with a default seed(). + GridParallelRNG RNG5rb(FrbGrid); RNG5rb.SeedFixedIntegers(seeds5); + + LatticeGaugeField Umu(UGrid); + std::vector U(4,UGrid); + + if ( JP.gaugefile.compare("Hot") == 0 ) { + SU3::HotConfiguration(RNG4, Umu); + } else { + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,JP.gaugefile); + // ypj [fixme] additional checks for the loaded configuration? + } + + for(int mu=0;mu(Umu,mu); + } + + RealD mass = JP.mass; + RealD M5 = JP.M5; + +// ypj [fixme] flexible support for a various Fermions +// RealD mob_b = JP.mob_b; // Gparity +// std::vector omega; // ZMobius + +// GparityMobiusFermionD ::ImplParams params; +// std::vector twists({1,1,1,0}); +// params.twists = twists; +// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); +// SchurDiagTwoOperator HermOp(Ddwf); + + +// int mrhs = JP.Nu; + int Ndir=4; + auto mpi_layout = GridDefaultMpi(); + std::vector mpi_split (Ndir,1); +#if 0 + int tmp=mrhs, dir=0; + std::cout << GridLogMessage << "dir= "<_processor,re); + src_tmp=re; + pickCheckerboard(Odd,src[i],src_tmp); + } + RNG5.Report(); +} else { + std::cout << GridLogMessage << "Using RNG5rb"< evec(JP.Nm,FrbGrid); + for(int i=0;i<1;++i){ + std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl; + }; + + int Nconv; + IRBL.calc(eval,evec,src,Nconv,JP.Impl); + + + Grid_finalize(); +} diff --git 
a/tests/lanczos/Test_dwf_block_lanczos.cc.single b/tests/lanczos/Test_dwf_block_lanczos.cc.single new file mode 100644 index 00000000..7449e32a --- /dev/null +++ b/tests/lanczos/Test_dwf_block_lanczos.cc.single @@ -0,0 +1,408 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_block_lanczos.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; +//using namespace Grid::QCD; + +//typedef typename GparityDomainWallFermionR::FermionField FermionField; +typedef typename ZMobiusFermionF::FermionField FermionField; + +RealD AllZero(RealD x){ return 0.;} + +class CmdJobParams +{ + public: + std::string gaugefile; + + int Ls; + double mass; + double M5; + double mob_b; + std::vector omega; + std::vector boundary_phase; + std::vector mpi_split; + + LanczosType Impl; + int Nu; + int Nk; + int Np; + int Nm; + int Nstop; + int Ntest; + int MaxIter; + double resid; + + double low; + double high; + int order; + + CmdJobParams() + : gaugefile("Hot"), + Ls(8), mass(0.01), M5(1.8), mob_b(1.5), + Impl(LanczosType::irbl),mpi_split(4,1), + Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8), + low(0.2), high(5.5), order(11) + {Nm=Nk+Np;}; + + void Parse(char **argv, int argc); +}; + + +void CmdJobParams::Parse(char **argv,int argc) +{ + std::string arg; + std::vector vi; + double re,im; + int expect, idx; + std::string vstr; + std::ifstream pfile; + + if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){ + gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf"); + } + + if( GridCmdOptionExists(argv,argv+argc,"--phase") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--phase"); + pfile.open(arg); + assert(pfile); + expect = 0; + while( pfile >> vstr ) { + if ( vstr.compare("boundary_phase") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(expect==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + boundary_phase.push_back({re,im}); + expect++; + } + } + pfile.close(); + } else { + for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.}); + } + + if( 
GridCmdOptionExists(argv,argv+argc,"--omega") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--omega"); + pfile.open(arg); + assert(pfile); + Ls = 0; + while( pfile >> vstr ) { + if ( vstr.compare("omega") == 0 ) { + pfile >> vstr; + GridCmdOptionInt(vstr,idx); + assert(Ls==idx); + pfile >> vstr; + GridCmdOptionFloat(vstr,re); + pfile >> vstr; + GridCmdOptionFloat(vstr,im); + omega.push_back({re,im}); + Ls++; + } + } + pfile.close(); + } else { + if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--Ls"); + GridCmdOptionInt(arg,Ls); + } + } + + if( GridCmdOptionExists(argv,argv+argc,"--mass") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mass"); + GridCmdOptionFloat(arg,mass); + } + + if( GridCmdOptionExists(argv,argv+argc,"--M5") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--M5"); + GridCmdOptionFloat(arg,M5); + } + + if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b"); + GridCmdOptionFloat(arg,mob_b); + } + + if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--irbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. + Impl = LanczosType::irbl; + Nm = Nk+Np; + } + + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--rbl"); + GridCmdOptionIntVector(arg,vi); + Nu = vi[0]; + Nk = vi[1]; + Np = vi[2]; // vector space is enlarged by adding Np vectors + Nstop = vi[3]; + MaxIter = vi[4]; + // ypj[fixme] mode overriding message is needed. 
+ Impl = LanczosType::rbl; + Nm = Nk+Np*MaxIter; + } + +#if 1 + // block Lanczos with explicit extension of its dimensions + if( GridCmdOptionExists(argv,argv+argc,"--split") ){ + arg = GridCmdOptionPayload(argv,argv+argc,"--split"); + GridCmdOptionIntVector(arg,vi); + for(int i=0;i seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + // ypj [note] why seed RNG5 again? bug? In this case, run with a default seed(). + GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5); + + LatticeGaugeField Umu(UGrid); + std::vector U(4,UGrid); + LatticeGaugeFieldF UmuF(UGridF); + std::vector UF(4,UGridF); + + if ( JP.gaugefile.compare("Hot") == 0 ) { + SU3::HotConfiguration(RNG4, Umu); + } else { + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,JP.gaugefile); + // ypj [fixme] additional checks for the loaded configuration? + } + precisionChange (UmuF,Umu); + + for(int mu=0;mu(Umu,mu); + } + + RealD mass = JP.mass; + RealD M5 = JP.M5; + +// ypj [fixme] flexible support for a various Fermions +// RealD mob_b = JP.mob_b; // Gparity +// std::vector omega; // ZMobius + +// GparityMobiusFermionD ::ImplParams params; +// std::vector twists({1,1,1,0}); +// params.twists = twists; +// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); +// SchurDiagTwoOperator HermOp(Ddwf); + + +// int mrhs = JP.Nu; + int Ndir=4; + auto mpi_layout = GridDefaultMpi(); + std::vector mpi_split (Ndir,1); +#if 0 + int tmp=mrhs, dir=0; + std::cout << GridLogMessage << "dir= "<_processor,re); + src_tmp=re; + pickCheckerboard(Odd,src[i],src_tmp); + } + RNG5.Report(); +} else { + std::cout << GridLogMessage << "Using RNG5rb"< evec(JP.Nm,FrbGridF); + for(int i=0;i<1;++i){ + std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl; + }; + + int Nconv; + 
IRBL.calc(eval,evec,src,Nconv,JP.Impl); + + + Grid_finalize(); +} diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 1fe29bb2..2ee5299b 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -35,26 +35,43 @@ template struct Setup{}; template<> -struct Setup{ - static GparityMobiusFermionR* getAction(LatticeGaugeField &Umu, +struct Setup{ + static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu, GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ - RealD mass=0.01; + RealD mass=0.00054; RealD M5=1.8; RealD mob_b=1.5; GparityMobiusFermionD ::ImplParams params; std::vector twists({1,1,1,0}); params.twists = twists; - return new GparityMobiusFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); + return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); } }; template<> -struct Setup{ - static DomainWallFermionR* getAction(LatticeGaugeField &Umu, +struct Setup{ + static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu, GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ - RealD mass=0.01; + RealD mass=0.00054; RealD M5=1.8; - return new DomainWallFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + } +}; + +template<> +struct Setup{ + static MobiusFermionF* getAction(LatticeGaugeFieldF &Umu, + GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ + RealD mass=0.00054; + RealD M5=1.8; + RealD mob_b=1.5; + std::vector boundary = {1,1,1,-1}; + MobiusFermionF::ImplParams Params(boundary); + + std::cout << GridLogMessage << "mass "<{ template void run(){ typedef typename Action::FermionField FermionField; - const int Ls=8; + const int Ls=12; GridCartesian * UGrid = 
SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); - printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid); +// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid); + + GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian* FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls, UGridF); + GridRedBlackCartesian* FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGridF); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); - GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGridF); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5rb(FrbGridF); RNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU::HotConfiguration(RNG4, Umu); +// SU::HotConfiguration(RNG4, Umu); + FieldMetaData header; + std::string file("./config"); - Action *action = Setup::getAction(Umu,FGrid,FrbGrid,UGrid,UrbGrid); +// int precision32 = 0; +// int tworow = 0; +// NerscIO::writeConfiguration(Umu,file,tworow,precision32); + NerscIO::readConfiguration(Umu,header,file); + + LatticeGaugeFieldF UmuF(UGridF); + precisionChange(UmuF, Umu); + + Action *action = Setup::getAction(UmuF,FGridF,FrbGridF,UGridF,UrbGridF); //MdagMLinearOperator HermOp(Ddwf); - SchurDiagTwoOperator HermOp(*action); +// SchurDiagTwoOperator HermOp(*action); + 
SchurDiagOneOperator HermOp(*action); - const int Nstop = 30; - const int Nk = 40; + const int Nstop = 150; + const int Nk = 160; const int Np = 40; const int Nm = Nk+Np; const int MaxIt= 10000; - RealD resid = 1.0e-8; + RealD resid = 1.0e-6; + std::cout << GridLogMessage << "Nstop "< Coeffs { 0.,-1.}; Polynomial PolyX(Coeffs); - Chebyshev Cheby(0.2,5.,11); + Chebyshev Cheby(0.0000006,5.5,4001); + std::cout << GridLogMessage << "Cheby(0.0000006,5.5,4001) "< OpCheby(Cheby,HermOp); - PlainHermOp Op (HermOp); + PlainHermOp Op (HermOp); ImplicitlyRestartedLanczos IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); std::vector eval(Nm); - FermionField src(FrbGrid); + FermionField src(FrbGridF); gaussian(RNG5rb,src); - std::vector evec(Nm,FrbGrid); + std::vector evec(Nm,FrbGridF); for(int i=0;i<1;i++){ std::cout << GridLogMessage <(); + run(); }else if(action == "DWF"){ - run(); + run(); + }else if(action == "Mobius"){ + run(); }else{ std::cout << "Unknown action" << std::endl; exit(1); From dc6a38f17739ab60a0ceb5d422e18b18ba05efb5 Mon Sep 17 00:00:00 2001 From: Chulwoo Jung Date: Wed, 30 Nov 2022 17:13:12 -0500 Subject: [PATCH 203/240] Minor cleanup --- .../ImplicitlyRestartedBlockLanczos.h | 145 +----------------- tests/lanczos/Test_dwf_block_lanczos.cc | 4 +- 2 files changed, 4 insertions(+), 145 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h index fe388c19..c5d00722 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h @@ -7,9 +7,8 @@ Copyright (C) 2015 Author: Peter Boyle -Author: Chulwoo Jung Author: Yong-Chull Jang -Author: Guido Cossu +Author: Chulwoo Jung This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -185,10 +184,6 @@ public: { RealD nn = norm2(v); nn = sqrt(nn); -#if 0 - if(if_print && nn < 1e20) 
- Glog<<"normalize: "< 1e-14) - Glog<<"orthogonalize before: "< 1e-14) - Glog<<"orthogonalize after: "<GlobalSumVector((CUDA_FLOAT*)c_acc,2*Nu*Nevec_acc); -#if 0 - for (int i=0; i[" << j << "," << i << "] = " - << z.x << " + i " << z.y << std::endl; - } - w[i] = w[i] - ip * evec[b*Nevec_acc+j]; - } - } -#else alpha = MAKE_CUDA_COMPLEX(-1.0,0.0); beta = MAKE_CUDA_COMPLEX(1.0,0.0); stat = CUDA_GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, 12*sites, Nu, Nevec_acc, @@ -340,9 +292,7 @@ public: &beta, w_acc, 12*sites); //Glog << stat << std::endl; -#endif } -#if 1 for (int col=0; col &inner, std::vector& lhs, int llhs, std::vector& rhs, int lrhs) -{ - typedef typename Field:vector_object vobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_typeD vector_type; - GridBase *grid = lhs[0]._grid; - assert(grid == rhs[0]._grid; - const int pad = 8; - int total = llhs*lrhs; - assert(inner.size()==total); - int sum_size=grid->SumArraySize(); - -// std::vector inner(total); - Vector sumarray(sum_size*pad*total); - - parallel_for(int thr=0;throSites(),thr,mywork,myoff); - - std::vector< decltype(innerProductD(lhs[0]._odata[0],rhs[0]._odata[0])) > vinner(total,zero); // private to thread; sub summation - for(int ss=myoff;ssGlobalSum(tmp); - inner[i]=tmp; - } -// return inner; -} -#endif - void orthogonalize_blockhead(Field& w, std::vector& evec, int k, int Nu) { @@ -839,14 +738,6 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_ Glog << fname + " CONVERGED ; Summary :\n"; // Sort convered eigenpairs. std::vector Btmp(Nstop,grid); // waste of space replicating -#if 0 - for(int i=0; i0) { - for (int u=0; u0) { - // orthogonalize_blockhead(w[0],evec,b,Nu); - // for (int u=1; uGlobalSumVector((double*)evec_tmp,2*NN*NN); } } - // Safer to sort instead of just reversing it, - // but the document of the routine says evals are sorted in increasing order. - // qr gives evals in decreasing order. 
-// for(int i=0;i +Author: Yong-Chull Jang +Author: Chulwoo Jung This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From ff6777a98daaa7fd433e15cf0256ca50d3dae9dd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Dec 2022 08:13:51 -0500 Subject: [PATCH 204/240] Variable depth experiments --- Grid/qcd/action/fermion/WilsonCompressor.h | 36 ++++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index f62b3aba..fd1bbe89 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -36,13 +36,16 @@ NAMESPACE_BEGIN(Grid); // Wilson compressor will need FaceGather policies for: // Periodic, Dirichlet, and partial Dirichlet for DWF /////////////////////////////////////////////////////////////// -const int dwf_compressor_depth=2; +const int dwf_compressor_depth=1; +#define DWF_COMPRESS class FaceGatherPartialDWF { public: - // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; - static int PartialCompressionFactor(GridBase *grid) {return 1;} - // static int PartialCompressionFactor(GridBase *grid) { return 1;} +#ifdef DWF_COMPRESS + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; +#else + static int PartialCompressionFactor(GridBase *grid) { return 1;} +#endif template static void Gather_plane_simple (commVector >& table, const Lattice &rhs, @@ -54,8 +57,11 @@ public: // Shrinks local and remote comms buffers GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; - // int depth=dwf_compressor_depth; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else int depth=Ls/2; +#endif std::pair *table_v = & table[0]; auto rhs_v = rhs.View(AcceleratorRead); int vol=table.size()/Ls; @@ -72,8 +78,11 
@@ public: static void DecompressFace(decompressor decompress,Decompression &dd) { auto Ls = dd.dims[0]; - // int depth=dwf_compressor_depth; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else int depth=Ls/2; +#endif // Just pass in the Grid auto kp = dd.kernel_p; auto mp = dd.mpi_p; @@ -106,8 +115,11 @@ public: { GridBase *Grid = rhs.Grid(); int Ls = Grid->_rdimensions[0]; - // int depth=dwf_compressor_depth; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else int depth = Ls/2; +#endif // insertion of zeroes... assert( (table.size()&0x1)==0); @@ -146,8 +158,11 @@ public: static void MergeFace(decompressor decompress,Merger &mm) { auto Ls = mm.dims[0]; +#ifdef DWF_COMPRESS + int depth=dwf_compressor_depth; +#else int depth = Ls/2; - // int depth=dwf_compressor_depth; +#endif int num= mm.buffer_size/2; // relate vol and Ls to buffer size auto mp = &mm.mpointer[0]; auto vp0= &mm.vpointers[0][0]; // First arg is exchange first @@ -175,8 +190,11 @@ public: class FaceGatherDWFMixedBCs { public: - // static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; +#ifdef DWF_COMPRESS + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; +#else static int PartialCompressionFactor(GridBase *grid) {return 1;} +#endif template static void Gather_plane_simple (commVector >& table, From b54d0f3c730009fb4992e7ff09d0417a23d56747 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Dec 2022 08:14:27 -0500 Subject: [PATCH 205/240] Smaller deltaH down to 7000s on t=0.5 trajectory --- HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc index e85bec7e..0a924486 100644 --- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc @@ -181,7 +181,7 @@ int main(int argc, char 
**argv) { // MD.name = std::string("MinimumNorm2"); // TrajL = 2 // 4/2 => 0.6 dH - // 3/3 => ?? dH + // 3/3 => 0.8 dH .. depth 3, slower //MD.MDsteps = 4; MD.MDsteps = 3; MD.trajL = 0.5; @@ -223,13 +223,15 @@ int main(int argc, char **argv) { RealD c = 0.5; Real beta = 2.13; // Real light_mass = 5.4e-4; - Real light_mass = 7.8e-4; + Real light_mass = 7.8e-4; + Real light_mass_dir = 0.01; Real strange_mass = 0.0362; Real pv_mass = 1.0; - std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); - OneFlavourRationalParams OFRp; // Up/down OFRp.lo = 4.0e-5; OFRp.hi = 90.0; @@ -238,18 +240,24 @@ int main(int argc, char **argv) { OFRp.mdtolerance= 1.0e-3; // OFRp.degree = 20; converges // OFRp.degree = 16; - OFRp.degree = 12; + OFRp.degree = 18; OFRp.precision= 80; OFRp.BoundsCheckFreq=0; std::vector ActionTolByPole({ 1.0e-8,1.0e-8,1.0e-8,1.0e-8, 1.0e-8,1.0e-8,1.0e-8,1.0e-8, - 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 }); std::vector MDTolByPole({ - 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + 1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more + // 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + // 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence 1.0e-8,1.0e-8,1.0e-8,1.0e-8, - 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 }); auto GridPtr = TheHMC.Resources.GetCartesian(); @@ -540,12 +548,12 @@ int main(int argc, char **argv) { *Numerators[h],*Denominators[h], *NumeratorsF[h],*DenominatorsF[h], *NumeratorsD2[h],*DenominatorsD2[h], - OFRp, 200) ); + OFRp, 400) ); Bdys.push_back( 
new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction( *Numerators[h],*Denominators[h], *NumeratorsF[h],*DenominatorsF[h], *NumeratorsD2[h],*DenominatorsD2[h], - OFRp, 200) ); + OFRp, 400) ); #else Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); From 5bb7ba92faa362aca13803a1f6a9e9a12dacd5ef Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Dec 2022 08:15:11 -0500 Subject: [PATCH 206/240] Test for DDHMC force term --- tests/forces/Test_double_ratio.cc | 510 ++++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 tests/forces/Test_double_ratio.cc diff --git a/tests/forces/Test_double_ratio.cc b/tests/forces/Test_double_ratio.cc new file mode 100644 index 00000000..0a350692 --- /dev/null +++ b/tests/forces/Test_double_ratio.cc @@ -0,0 +1,510 @@ +/* + 2f Full det MdagM 10^6 force ~ 1.3e7 +rid : Message : 1767.283471 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 1767.283476 s : S1 : 1.52885e+09 +Grid : Message : 1767.283480 s : S2 : 1.52886e+09 +Grid : Message : 1767.283482 s : dS : 8877.34 +Grid : Message : 1767.283483 s : dSpred : 8877.7 +Grid : Message : 1767.283484 s : diff : -0.360484 +Grid : Message : 1767.283485 s : ********************************************************* + + 2f Full det MpcdagMpc 10^6 force ~ 1.8e6 +Grid : Message : 2399.576962 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 2399.576968 s : S1 : 1.52885e+09 +Grid : Message : 2399.576972 s : S2 : 1.52886e+09 +Grid : Message : 2399.576974 s : dS : 9728.49 +Grid : Message : 2399.576975 s : dSpred : 9726.58 +Grid : Message : 2399.576976 s : diff : 1.90683 +Grid : Message : 2399.576977 s : ********************************************************* + + 2f bdy MdagM 1500 force Force ~ 2800 +Grid : Message : 4622.385061 s : 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 4622.385067 s : S1 : 1.52885e+09 +Grid : Message : 4622.385071 s : S2 : 1.52885e+09 +Grid : Message : 4622.385072 s : dS : 25.4944 +Grid : Message : 4622.385073 s : dSpred : 25.4672 +Grid : Message : 4622.385074 s : diff : 0.0271414 +Grid : Message : 4622.385075 s : ********************************************************* + + 2f bdy MpcdagMpc 10^6 force Force ~ 2200 +Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 4622.385067 s : S1 : 1.52885e+09 +Grid : Message : 4622.385071 s : S2 : 1.52885e+09 +Grid : Message : 4622.385072 s : dS : 25.4944 +Grid : Message : 4622.385073 s : dSpred : 25.4672 +Grid : Message : 4622.385074 s : diff : 0.0271414 +Grid : Message : 4622.385075 s : ********************************************************* + + 1f Bdy Det +// +// These all had tol set by OFRp, not through MDpoles +// So assumptions it was Remez might be wrong. +// +Optimisation log: looser rational AND MD tolerances sloppy +MobiusForce.221179 -- same as HMC. 
dS is mispredicted Forece ~2.8 +Grid : Message : 6582.258991 s : dS : 0.024478 +Grid : Message : 6582.258992 s : dSpred : 0.00791876 +Grid : Message : 6582.258994 s : diff : 0.0165592 + +MobiusForce.221193 -- tight rational AND MD tolerances to 1e-8 ~ 2.8 same +Grid : Message : 1964.939209 s : S1 : 7.64404e+08 +Grid : Message : 1964.939213 s : S2 : 7.64404e+08 +Grid : Message : 1964.939215 s : dS : -0.00775838 <--- too loose even on action +Grid : Message : 1964.939216 s : dSpred : -0.00416793 +Grid : Message : 1964.939217 s : diff : -0.00359045 + +MobiusForce.221394 -- tight rational, MD tol sloppy Force ~ 2.8 +Grid : Message : 2376.921950 s : S1 : 764404436.44069 +Grid : Message : 2376.921954 s : S2 : 764404436.43299 +Grid : Message : 2376.921956 s : dS : -0.0076971054077148 +Grid : Message : 2376.921958 s : dSpred : -0.0041610472282526 +Grid : Message : 2376.921959 s : diff : -0.0035360581794623 + + +MobiusForce.221587 -- slightly sloppier action, coming from tol array + -- much sloppier force + -- degree 18 + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy +Grid : Message : 2438.875507 s : S1 : 764404436.42251 +Grid : Message : 2438.875512 s : S2 : 764404436.4148 +Grid : Message : 2438.875514 s : dS : -0.0077102184295654 +Grid : Message : 2438.875516 s : dSpred : -0.0075684496959103 +Grid : Message : 2438.875517 s : diff : -0.00014176873365508 + +MobiusForce.221639 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence more + +Grid : Message : 2373.927550 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 2373.927600 s : S1 : 764404436.42251 +Grid : Message : 2373.927640 s : S2 : 764404436.4148 +Grid : Message : 2373.927660 s : dS : -0.0077102184295654 +Grid : Message : 2373.927680 s : dSpred : -0.0075993463919849 +Grid : Message : 2373.927690 s : diff : -0.00011087203758051 +Grid : Message : 2373.927700 s : ********************************************************* + + +Grid : Message : 69.269319 s : 
ApproxPowerMD shift[0] pole 9.5166866092503e-06 residue -2.0047722631555e-08 tol 3e-06 +Grid : Message : 69.269321 s : ApproxPowerMD shift[1] pole 4.7123486192778e-05 residue -1.316766030683e-07 tol 1e-06 +Grid : Message : 69.269323 s : ApproxPowerMD shift[2] pole 0.00014860967743736 residue -6.109883117444e-07 tol 1e-07 +Grid : Message : 69.269325 s : ApproxPowerMD shift[3] pole 0.00041055696132763 residue -2.6088717433891e-06 tol 1e-07 +Grid : Message : 69.269327 s : ApproxPowerMD shift[4] pole 0.0010822555692906 residue -1.0853799412802e-05 tol 1e-08 +Grid : Message : 69.269329 s : ApproxPowerMD shift[5] pole 0.0028029613512087 residue -4.4741734470158e-05 tol 1e-08 +Grid : Message : 69.269331 s : ApproxPowerMD shift[6] pole 0.0072103567378527 residue -0.00018380499193253 tol 1e-08 + +rusher 96I]$ more MobiusForce.221887 + 1.0e-5,3.0e-6,3.0e-7,1.0e-7, // soften convergence more more +// <-- this is the dirichlet solve, why poorer conditioned??? +Grid : Message : 1627.226206 s : ConjugateGradientMultiShift k=3643 Shift 3 has converged +Grid : Message : 1667.373045 s : ConjugateGradientMultiShift k=5381 Shift 2 has converged +Grid : Message : 1705.236992 s : ConjugateGradientMultiShift k=7063 Shift 1 has converged +Grid : Message : 1752.493182 s : ConjugateGradientMultiShift k=9220 Shift 0 has converged +// +//Grid : Message : 1414.837250 s : OneFlavourEvenOddRatioRationalPseudoFermionAction deriv: doing (M^dag M)^{-1/2} ( (V^dag V)^{1/4} Phi) +Grid : Message : 1523.416680 s : ConjugateGradientMultiShift k=3846 Shift 2 has converged +Grid : Message : 1530.798503 s : ConjugateGradientMultiShift k=4143 Shift 1 has converged +Grid : Message : 1536.153421 s : ConjugateGradientMultiShift k=4353 Shift 0 has converged <-- this is the non-dirichlet solve + +Grid : Message : 2339.927565 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 2339.927571 s : S1 : 764404436.42251 +Grid : Message : 2339.927575 s : S2 : 764404436.4148 +Grid : Message : 
2339.927577 s : dS : -0.0077102184295654 +Grid : Message : 2339.927579 s : dSpred : -0.0068752425267964 +Grid : Message : 2339.927580 s : diff : -0.00083497590276901 +Grid : Message : 2339.927581 s : ********************************************************* +Grid : Message : 2339.927582 s : Done +Grid : Message : 2339.927582 s : ********************************************************* + +Force 76 S {S {S {(9.0175185326468,-3.5764415623768e-36)}}} +Force 77 S {S {S {(4.1289977678493,-4.3364721285803e-37)}}} +Force 78 S {S {S {(3.2299269465841,6.0391022273495e-37)}}} +Force 79 S {S {S {(3.0051199649288,-9.6243599973575e-37)}}} +Force 80 S {S {S {(2.8924316727872,-1.3371248240604e-37)}}} +Force 81 S {S {S {(2.8270868791781,1.792628885004e-37)}}} +Force 82 S {S {S {(2.8676819960087,-1.3518185034456e-36)}}} +Force 83 S {S {S {(2.7724152154523,1.4950818774521e-37)}}} +Force 84 S {S {S {(3.0204624534964,-9.6475025423893e-36)}}} +Force 85 S {S {S {(2.8631304063459,2.2426228161781e-37)}}} +Force 86 S {S {S {(2.9025673908905,-1.3942465026706e-36)}}} +Force 87 S {S {S {(2.8553405232646,-2.0938493124022e-38)}}} +Force 88 S {S {S {(3.2820184381375,-1.422348164495e-36)}}} +Force 89 S {S {S {(3.8974980085791,1.1682209795266e-35)}}} +Force 90 S {S {S {(4.660053618223,-1.4399805797573e-37)}}} +Force 91 S {S {S {(6.7993872372366,1.4524702072348e-36)}}} +Full +Grid : Message : 1523.416680 s : ConjugateGradientMultiShift k=3846 Shift 2 has converged +Grid : Message : 1530.798503 s : ConjugateGradientMultiShift k=4143 Shift 1 has converged +Grid : Message : 1536.153421 s : ConjugateGradientMultiShift k=4353 Shift 0 has converged +PV solve depth 3 +Grid : Message : 1667.373045 s : ConjugateGradientMultiShift k=5381 Shift 2 has converged +Grid : Message : 1705.236992 s : ConjugateGradientMultiShift k=7063 Shift 1 has converged +Grid : Message : 1752.493182 s : ConjugateGradientMultiShift k=9220 Shift 0 has converged + +MobiusForce.222490 depth 1 +Grid : Message : 2155.595070 s : 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 2155.595076 s : S1 : 764404436.37475 +Grid : Message : 2155.595080 s : S2 : 764404436.21131 +Grid : Message : 2155.595082 s : dS : -0.16344606876373 +Grid : Message : 2155.595084 s : dSpred : -0.16235663327375 +Grid : Message : 2155.595085 s : diff : -0.0010894354899788 + +Force 4 S {S {S {(24.512489110423,-7.4203080895657e-36)}}} +Force 5 S {S {S {(14.442663101577,7.3909207307951e-37)}}} +Force 6 S {S {S {(12.298567945213,2.1989091200069e-36)}}} +Force 7 S {S {S {(11.582362859271,-2.2540104177017e-36)}}} +Force 8 S {S {S {(11.465725500906,-2.9512255045332e-36)}}} +Force 9 S {S {S {(10.869067954412,-2.8388188572358e-36)}}} +Force 10 S {S {S {(10.937111429576,-3.3530976357206e-36)}}} +Force 11 S {S {S {(11.23500117508,-1.4487967873885e-36)}}} +Force 12 S {S {S {(10.900736551834,5.1427877848475e-36)}}} Force is bigger +Force 13 S {S {S {(10.951921323651,-1.2098775605838e-35)}}} +Force 14 S {S {S {(10.676529230575,-2.50527233519e-36)}}} +Force 15 S {S {S {(10.98568474467,3.2193851533145e-36)}}} +Force 16 S {S {S {(11.931707726568,-8.5223340434616e-37)}}} +Force 17 S {S {S {(13.751904678482,7.6337337826369e-36)}}} +Force 18 S {S {S {(17.518955473833,1.8073225643893e-36)}}} +Force 19 S {S {S {(20.36519304598,-2.5184966466368e-36)}}} +Full solve +Grid : Message : 1441.297575 s : ConjugateGradientMultiShift k=3846 Shift 2 has converged +Grid : Message : 1449.206520 s : ConjugateGradientMultiShift k=4143 Shift 1 has converged +Grid : Message : 1454.352909 s : ConjugateGradientMultiShift k=4353 Shift 0 has converged + +Dirichlet solve -- why so expensive?? +Spectral radius worse? +Grid : Message : 1571.887003 s : ConjugateGradientMultiShift k=5195 Shift 2 has converged +Grid : Message : 1599.543760 s : ConjugateGradientMultiShift k=6508 Shift 1 has converged +Grid : Message : 1625.368198 s : ConjugateGradientMultiShift k=7819 Shift 0 has converged + + +dS is much bigger. 
+ + +MobiusForce.223606 +Grid : Message : 1123.276405 s : ConjugateGradientMultiShift k=3273 Shift 0 has converged +Grid : Message : 1125.945359 s : ConjugateGradientMultiShift k=3407 Shift 1 has converged +Grid : Message : 1127.896580 s : ConjugateGradientMultiShift k=3508 Shift 2 has converged <-- 2 takes longer +first (bdy) hasenbusch mass raised to 0.005 -- reduces Dirchlet solve cost +Force looks ok still +Grid : Message : 1510.884960 s : OneFlavourEvenOddRatioRationalPseudoFermionAction compute action: complete +Grid : Message : 1510.969380 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 1510.969440 s : S1 : 764404436.37475 +Grid : Message : 1510.969480 s : S2 : 764404436.17379 +Grid : Message : 1510.969500 s : dS : -0.20095825195312 +Grid : Message : 1510.969520 s : dSpred : -0.20025674631954 +Grid : Message : 1510.969530 s : diff : -0.00070150563358654 +Force 76 S {S {S {(24.161229317675,2.0147973173094e-35)}}} +Force 77 S {S {S {(15.841085162729,3.983456481349e-36)}}} +Force 78 S {S {S {(11.031761776856,9.0394046210295e-35)}}} +Force 79 S {S {S {(12.177830066719,1.583978637733e-36)}}} +Force 80 S {S {S {(9.8372072482222,6.4284847310594e-37)}}} +Force 81 S {S {S {(9.6588863493149,1.0501572656659e-35)}}} +Force 82 S {S {S {(10.623076227724,-4.4161853392455e-35)}}} +Force 83 S {S {S {(8.9477003784221,-7.067659784319e-37)}}} +Force 84 S {S {S {(9.7663166497594,-2.1014900256825e-35)}}} +Force 85 S {S {S {(8.9992648919057,-4.7107936109203e-36)}}} +Force 86 S {S {S {(9.0399987268337,6.4652189295226e-37)}}} +Force 87 S {S {S {(9.1319052497073,7.9566273871284e-37)}}} +Force 88 S {S {S {(10.094569606113,-1.263656427134e-37)}}} +Force 89 S {S {S {(11.563679905523,-1.2777623593438e-35)}}} +Force 90 S {S {S {(13.653150474463,2.9093485182852e-37)}}} +Force 91 S {S {S {(16.303719912019,2.9857556510886e-36)}}} + +MobiusForce.223749 +first (bdy) hasenbusch mass raised to 0.01 -- reduces Dirchlet solve cost +Grid : Message : 1374.472462 s : 
OneFlavourEvenOddRatioRationalPseudoFermionAction compute action: complete +Grid : Message : 1374.479206 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 1374.479211 s : S1 : 764404436.37428 +Grid : Message : 1374.479215 s : S2 : 764404436.20009 +Grid : Message : 1374.479217 s : dS : -0.17418932914734 +Grid : Message : 1374.479219 s : dSpred : -0.17358090105485 +Grid : Message : 1374.479220 s : diff : -0.00060842809248995 +Force 76 S {S {S {(27.006858541753,4.2141472476979e-36)}}} +Force 77 S {S {S {(19.388701462694,-5.1620365048422e-35)}}} +Force 78 S {S {S {(13.502424539662,-2.4038859474316e-35)}}} +Force 79 S {S {S {(15.555776987064,6.0567346426118e-36)}}} +Force 80 S {S {S {(12.752116522904,-2.3720006631655e-35)}}} +Force 81 S {S {S {(12.656857824233,1.6912424972456e-35)}}} +Force 82 S {S {S {(15.159284452724,5.0898905390605e-36)}}} +Force 83 S {S {S {(12.222695136014,-2.2061824913027e-35)}}} +Force 84 S {S {S {(12.92077598466,9.6287681011731e-36)}}} +Force 85 S {S {S {(11.884630495484,2.822655809912e-36)}}} +Force 86 S {S {S {(11.896353116174,1.0926219990893e-35)}}} +Force 87 S {S {S {(11.557019282287,2.1532117771187e-35)}}} +Force 88 S {S {S {(11.945108384613,-3.0210204816133e-36)}}} +Force 89 S {S {S {(13.295373801078,7.3115748621146e-36)}}} +Force 90 S {S {S {(15.373728471417,-7.4923071185536e-36)}}} +Force 91 S {S {S {(17.348173714234,1.0344350287236e-36)}}} + +MobiusForce.223829 + 1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more +Grid : Message : 1000.951387 s : ConjugateGradientMultiShift k=1881 Shift 0 has converged +Grid : Message : 1002.619542 s : ConjugateGradientMultiShift k=1960 Shift 1 has converged +Grid : Message : 1003.726982 s : ConjugateGradientMultiShift k=2014 Shift 4 has converged +Grid : Message : 1005.698741 s : ConjugateGradientMultiShift k=2113 Shift 2 has converged +Grid : Message : 1007.320875 s : ConjugateGradientMultiShift k=2197 Shift 3 has converged +Grid : Message : 1351.171259 s : S1 : 
764404436.37428 +Grid : Message : 1351.171263 s : S2 : 764404436.20009 +Grid : Message : 1351.171265 s : dS : -0.17418932914734 +Grid : Message : 1351.171266 s : dSpred : -0.1743248065338 +Grid : Message : 1351.171267 s : diff : 0.00013547738646566 +Force 76 S {S {S {(27.004288088317,6.035575744297e-35)}}} +Force 77 S {S {S {(19.388023720604,-6.9736202362532e-36)}}} +Force 78 S {S {S {(13.502663916173,6.4067380855692e-35)}}} +Force 79 S {S {S {(15.55135748152,1.7219522871608e-35)}}} +Force 80 S {S {S {(12.75135802213,-1.1303847551095e-35)}}} +Force 81 S {S {S {(12.655732786276,1.689773129307e-36)}}} +Force 82 S {S {S {(15.158469055699,-6.7205950772387e-35)}}} +Force 83 S {S {S {(12.222907191126,-1.6775773754173e-35)}}} +Force 84 S {S {S {(12.916025368247,-1.9641041234302e-35)}}} +Force 85 S {S {S {(11.881879452577,-2.3054382955502e-36)}}} +Force 86 S {S {S {(11.897253557199,-3.3617669065579e-35)}}} +Force 87 S {S {S {(11.55717723524,-1.8690360178074e-36)}}} +Force 88 S {S {S {(11.945590605851,-6.7208889508264e-36)}}} +Force 89 S {S {S {(13.298173932749,-1.0322309768158e-35)}}} +Force 90 S {S {S {(15.373845416836,7.4158999857501e-36)}}} +Force 91 S {S {S {(17.348058307158,-1.8514036025451e-36)}}} +-- could make the stopping condition mandatory if shift 0 is converged. +-- Save 20% of iterations and single tunable +*/ + +// +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_double_ratio.cc + + Copyright (C) 2022 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +typedef MobiusFermionD FermionAction; +typedef WilsonImplD FimplD; +typedef WilsonImplD FermionImplPolicy; + +template +void ForceTest(Action &action,LatticeGaugeField & U,MomentumFilterBase &Filter) +{ + GridBase *UGrid = U.Grid(); + + std::vector seeds({1,2,3,5}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + + LatticeColourMatrix Pmu(UGrid); + LatticeGaugeField P(UGrid); + LatticeGaugeField UdSdU(UGrid); + + std::cout << GridLogMessage << "*********************************************************"<(UdSdU,mu); + Pmu= PeekIndex(P,mu); + dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*2.0; + } + ComplexD dSpred = sum(dS); + RealD diff = S2-S1-dSpred.real(); + + std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt_size[0]/mpi_layout[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt_size[1]/mpi_layout[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt_size[2]/mpi_layout[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt_size[3]/mpi_layout[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionAction::ImplParams ParamsDir(boundary); + Params.dirichlet=NonDirichlet; + ParamsDir.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=1; + + ///////////////////// Gauge Field and Gauge Forces //////////////////////////// + LatticeGaugeField U(UGrid); + + RealD beta=6.0; + WilsonGaugeActionR PlaqAction(beta); + IwasakiGaugeActionR RectAction(beta); + + MomentumFilterNone FilterNone; + ForceTest(PlaqAction,U,FilterNone); + ForceTest(RectAction,U,FilterNone); + + //////////////////////////////////// + // Action + //////////////////////////////////// + RealD mass=0.00078; + RealD dmass=0.01; + RealD pvmass=1.0; + RealD M5=1.8; + RealD b=1.5; + RealD c=0.5; + + // Double versions + FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params); + FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params); + FermionAction DdwfDirichlet(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,dmass,M5,b,c,ParamsDir); + + double StoppingCondition = 1.0e-8; + double MaxCGIterations = 50000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + //////////////////// Two Flavour Determinant Ratio /////////////////////////////// + TwoFlavourRatioPseudoFermionAction Nf2(PVPeriodic, DdwfPeriodic,CG,CG); + // ForceTest(Nf2,U,FilterNone); + + //////////////////// Two Flavour Determinant force test Even Odd /////////////////////////////// + TwoFlavourEvenOddRatioPseudoFermionAction 
Nf2eo(PVPeriodic, DdwfPeriodic,CG,CG); + // ForceTest(Nf2eo,U,FilterNone); + + //////////////////// Domain forces //////////////////// + int Width=4; + DDHMCFilter DDHMCFilter(Block4,Width); + + //////////////////// Two flavour boundary det //////////////////// + TwoFlavourRatioPseudoFermionAction BdyNf2(DdwfDirichlet, DdwfPeriodic,CG,CG); + // ForceTest(BdyNf2,U,DDHMCFilter); + + //////////////////// Two flavour eo boundary det //////////////////// + TwoFlavourEvenOddRatioPseudoFermionAction BdyNf2eo(DdwfDirichlet, DdwfPeriodic,CG,CG); + // ForceTest(BdyNf2eo,U,DDHMCFilter); + + //////////////////// One flavour boundary det //////////////////// + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 4.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-9; + OFRp.mdtolerance= 1.0e-8; + OFRp.degree = 18; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector ActionTolByPole({ + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + 1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more + // 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence more + // 1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy + // 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + OneFlavourEvenOddRatioRationalPseudoFermionAction BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp); + BdySqrt.SetTolerances(ActionTolByPole,MDTolByPole); + ForceTest(BdySqrt,U,DDHMCFilter); + + Grid_finalize(); +} From 140684d706009df5525de9f5e00a4496b5986ec1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Dec 2022 08:15:38 -0500 Subject: [PATCH 207/240] Head to head vs HMC --- HMC/Mobius2p1f_EOFA_96I_hmc.cc | 492 +++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 HMC/Mobius2p1f_EOFA_96I_hmc.cc diff --git a/HMC/Mobius2p1f_EOFA_96I_hmc.cc 
b/HMC/Mobius2p1f_EOFA_96I_hmc.cc new file mode 100644 index 00000000..d27d558e --- /dev/null +++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc @@ -0,0 +1,492 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +template + class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction { + public: + typedef typename FermionOperatorD::FermionField FieldD; + typedef typename FermionOperatorF::FermionField FieldF; + + using OperatorFunction::operator(); + + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + GridBase* SinglePrecGrid4; //Grid for single-precision fields + GridBase* SinglePrecGrid5; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + + FermionOperatorF &FermOpF; + FermionOperatorD &FermOpD;; + SchurOperatorF &LinOpF; + SchurOperatorD &LinOpD; + + Integer TotalInnerIterations; //Number of inner CG iterations + Integer TotalOuterIterations; //Number of restarts + Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + + MixedPrecisionConjugateGradientOperatorFunction(RealD tol, + Integer maxinnerit, + Integer maxouterit, + GridBase* _sp_grid4, + GridBase* _sp_grid5, + FermionOperatorF &_FermOpF, + FermionOperatorD &_FermOpD, + SchurOperatorF &_LinOpF, + SchurOperatorD &_LinOpD): + LinOpF(_LinOpF), + LinOpD(_LinOpD), + FermOpF(_FermOpF), + FermOpD(_FermOpD), + Tolerance(tol), + InnerTolerance(tol), + MaxInnerIterations(maxinnerit), + MaxOuterIterations(maxouterit), + SinglePrecGrid4(_sp_grid4), + SinglePrecGrid5(_sp_grid5), + OuterLoopNormMult(100.) 
+ { + /* Debugging instances of objects; references are stored + std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " < &LinOpU, const FieldD &src, FieldD &psi) { + + std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<(&LinOpU); + + // std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <_Mat)<_Mat)==&(LinOpD._Mat)); + + //////////////////////////////////////////////////////////////////////////////////// + // Must snarf a single precision copy of the gauge field in Linop_d argument + //////////////////////////////////////////////////////////////////////////////////// + typedef typename FermionOperatorF::GaugeField GaugeFieldF; + typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; + typedef typename FermionOperatorD::GaugeField GaugeFieldD; + typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; + + GridBase * GridPtrF = SinglePrecGrid4; + GridBase * GridPtrD = FermOpD.Umu.Grid(); + GaugeFieldF U_f (GridPtrF); + GaugeLinkFieldF Umu_f(GridPtrF); + // std::cout << " Dim gauge field "<Nd()<Nd()<(FermOpD.Umu, mu); + precisionChange(Umu_f,Umu_d); + PokeIndex(FermOpF.Umu, Umu_f, mu); + } + pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); + pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); + + //////////////////////////////////////////////////////////////////////////////////// + // Make a mixed precision conjugate gradient + //////////////////////////////////////////////////////////////////////////////////// +#if 1 + RealD delta=1.e-4; + std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); +#else + std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" < MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); +#endif + MPCG(src,psi); + } + }; + +NAMESPACE_END(Grid); + + +int main(int argc, char **argv) { + using 
namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + //typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => 0.8 dH .. depth 3, slower + //MD.MDsteps = 4; + MD.MDsteps = 3; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + // std::vector hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated + // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto 
GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAF; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; + typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG_EOFA; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); + auto GridPtrF = SpaceTimeGrid::makeFourDimGrid(latt4,simdF,mpi); + auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF); + auto FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF); + auto FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + LatticeGaugeFieldF UF(GridPtrF); UF=Zero(); + LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionActionF::ImplParams ParamsF(boundary); + Params.dirichlet=NonDirichlet; + ParamsF.dirichlet=NonDirichlet; + + // double StoppingCondition 
= 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-8; + double MDStoppingCondition = 1e-7; + double MDStoppingConditionLoose = 1e-7; + double MDStoppingConditionStrange = 1e-7; + double MaxCGIterations = 300000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(3); + ActionLevel Level3(15); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.25; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-5; + SFRp.mdtolerance= 2.0e-4; + SFRp.degree = 8; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF); + LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF); + + const int 
MX_inner = 1000; + MxPCG_EOFA ActionCGL(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA DerivativeCGL(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_LF,Strange_Op_L, + Strange_LinOp_LF,Strange_LinOp_L); + + MxPCG_EOFA ActionCGR(StoppingCondition, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + MxPCG_EOFA DerivativeCGR(MDStoppingConditionStrange, + MX_inner, + MaxCGIterations, + GridPtrF, + FrbGridF, + Strange_Op_RF,Strange_Op_R, + Strange_LinOp_RF,Strange_LinOp_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCGL, ActionCGR, + DerivativeCGL, DerivativeCGR, + SFRp, true); + // Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector NumeratorsF; + std::vector DenominatorsF; + std::vector NumeratorsD2; + std::vector DenominatorsD2; + std::vector *> Quotients; + std::vector ActionMPCG; + std::vector MPCG; + +#define MIXED_PRECISION +#ifdef MIXED_PRECISION + std::vector *> Bdys; +#else + std::vector *> Bdys; +#endif + + typedef SchurDiagMooeeOperator LinearOperatorF; + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + std::vector LinOpF; + + for(int h=0;h(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); + } + int nquo=Quotients.size(); + for(int h=0;h Date: Sat, 17 Dec 2022 20:16:11 -0500 Subject: [PATCH 208/240] Simpler test for PETSc --- tests/core/Test_fft_matt.cc | 160 
++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/core/Test_fft_matt.cc diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc new file mode 100644 index 00000000..d4455a7e --- /dev/null +++ b/tests/core/Test_fft_matt.cc @@ -0,0 +1,160 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding + GridParallelRNG pRNG(&GRID); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeFieldD Umu(&GRID); + + SU::ColdConfiguration(pRNG,Umu); // Unit gauge + + //////////////////////////////////////////////////// + // Wilson test + //////////////////////////////////////////////////// + { + LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + + RealD mass=0.01; + WilsonFermionD Dw(Umu,GRID,RBGRID,mass); + + Dw.M(src,tmp); + + std::cout << "Dw src = " < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < Date: Tue, 21 Feb 2023 10:52:42 -0500 Subject: [PATCH 209/240] Precision change improvements Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation. In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes. 
Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces Renamed the original precisionChange as precisionChangeOrig Fixed incorrect pointer offset bug in copyLane Added a test and a benchmark for precisionChange Added a test for reliable update CG --- .../ConjugateGradientMultiShiftMixedPrec.h | 11 +- .../ConjugateGradientReliableUpdate.h | 32 ++- Grid/lattice/Lattice_transfer.h | 127 +++++++++++- Grid/tensors/Tensor_extract_merge.h | 6 +- benchmarks/Benchmark_prec_change.cc | 189 ++++++++++++++++++ tests/core/Test_prec_change.cc | 124 ++++++++++++ tests/solver/Test_dwf_relupcg_prec.cc | 143 +++++++++++++ 7 files changed, 616 insertions(+), 16 deletions(-) create mode 100644 benchmarks/Benchmark_prec_change.cc create mode 100644 tests/core/Test_prec_change.cc create mode 100644 tests/solver/Test_dwf_relupcg_prec.cc diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index de1cfe01..a89a1e4a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -130,6 +130,9 @@ public: GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GridBase *DoublePrecGrid = src_d.Grid(); + precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid); + precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid); + //////////////////////////////////////////////////////////////////////// // Convenience references to the info stored in "MultiShiftFunction" //////////////////////////////////////////////////////////////////////// @@ -200,10 +203,10 @@ public: r_d = p_d; //MdagM+m[0] - precisionChangeFast(p_f,p_d); + precisionChange(p_f, p_d, pc_wk_d_to_s); Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) - precisionChangeFast(tmp_d,mmp_f); + precisionChange(tmp_d, mmp_f, pc_wk_s_to_d); 
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) tmp_d = tmp_d - mmp_d; std::cout << " Testing operators match "< &Linop_f; LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; - RealD Delta; //reliable update parameter + RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -116,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -134,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -173,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -194,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < &in, Lattice &out) }); } +//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. 
by using double2 for the double-precision field) template void precisionChangeFast(Lattice &out, const Lattice &in) { @@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice &out, const Lattice &in) precisionChange(vout,vin,N); }); } -//Convert a Lattice from one precision to another +//Convert a Lattice from one precision to another (original, slow implementation) template -void precisionChange(Lattice &out, const Lattice &in) +void precisionChangeOrig(Lattice &out, const Lattice &in) { assert(out.Grid()->Nd() == in.Grid()->Nd()); for(int d=0;dNd();d++){ @@ -1145,6 +1146,128 @@ void precisionChange(Lattice &out, const Lattice &in) }); } +//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls +class precisionChangeWorkspace{ + std::pair* fmap_device; //device pointer + //maintain grids for checking + GridBase* _out_grid; + GridBase* _in_grid; +public: + precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ + //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device + assert(out_grid->Nd() == in_grid->Nd()); + for(int d=0;dNd();d++){ + assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); + } + int Nsimd_out = out_grid->Nsimd(); + + std::vector out_icorrs(out_grid->Nsimd()); //reuse these + for(int lane=0; lane < out_grid->Nsimd(); lane++) + out_grid->iCoorFromIindex(out_icorrs[lane], lane); + + std::vector > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd + thread_for(out_oidx,out_grid->oSites(),{ + Coordinate out_ocorr; + out_grid->oCoorFromOindex(out_ocorr, out_oidx); + + Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate) + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr); + + //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr); 
+ //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice + //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity + int in_oidx = 0, in_lane = 0; + for(int d=0;d_ndimension;d++){ + in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] ); + in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] ); + } + fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair( in_oidx, in_lane ); + } + }); + + //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines) + size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair); + fmap_device = (std::pair*)acceleratorAllocDevice(fmap_bytes); + acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes); + } + + //Prevent moving or copying + precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete; + precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete; + precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete; + + std::pair const* getMap() const{ return fmap_device; } + + void checkGrids(GridBase* out, GridBase* in) const{ + conformable(out, _out_grid); + conformable(in, _in_grid); + } + + ~precisionChangeWorkspace(){ + acceleratorFreeDevice(fmap_device); + } +}; + + +//We would like to use precisionChangeFast when possible. 
However usage of this requires the Grids to be the same (runtime check) +//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery +template +auto _precisionChangeFastWrap(Lattice &out, const Lattice &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){ + if(out.Grid() == in.Grid()){ + precisionChangeFast(out,in); + return 1; + }else{ + return 0; + } +} +template +int _precisionChangeFastWrap(Lattice &out, const Lattice &in, long dummy){ //note long here is intentional; it means the above is preferred if available + return 0; +} + + +//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace +//which contains the mapping data. +template +void precisionChange(Lattice &out, const Lattice &in, const precisionChangeWorkspace &workspace){ + if(_precisionChangeFastWrap(out,in,0)) return; + + static_assert( std::is_same::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + + out.Checkerboard() = in.Checkerboard(); + constexpr int Nsimd_out = VobjOut::Nsimd(); + + workspace.checkGrids(out.Grid(),in.Grid()); + std::pair const* fmap_device = workspace.getMap(); + + //Do the copy/precision change + autoView( out_v , out, AcceleratorWrite); + autoView( in_v , in, AcceleratorRead); + + accelerator_for(out_oidx, out.Grid()->oSites(), 1,{ + std::pair const* fmap_osite = fmap_device + out_oidx*Nsimd_out; + for(int out_lane=0; out_lane < Nsimd_out; out_lane++){ + int in_oidx = fmap_osite[out_lane].first; + int in_lane = fmap_osite[out_lane].second; + copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane); + } + }); +} + +//Convert a Lattice from one precision to another. 
Much faster than original implementation but slower than precisionChangeFast +//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device +template +void precisionChange(Lattice &out, const Lattice &in){ + if(_precisionChangeFastWrap(out,in,0)) return; + precisionChangeWorkspace workspace(out.Grid(), in.Grid()); + precisionChange(out, in, workspace); +} + + + + //////////////////////////////////////////////////////////////////////////////// // Communicate between grids //////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index 87572faf..f1407d1f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -226,7 +226,7 @@ template accelerator_inline void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in) { - static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same + static_assert( std::is_same::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same typedef typename vobjOut::vector_type ovector_type; typedef typename vobjIn::vector_type ivector_type; @@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest ovector_type * __restrict__ op = (ovector_type *)&vecOut; ivector_type * __restrict__ ip = (ivector_type *)&vecIn; for(int w=0;w +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + 
LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); tmp_d = field_d; + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + precisionChange(field_d2, field_d); tmp_d2 = field_d2; + + LatticeFermionF field_f(FGridF), tmp_f(FGridF); + precisionChange(field_f, field_d); tmp_f = field_f; + + int N = 500; + + double time_ds = 0, time_sd = 0; + + std::cout<double original implementation (fields initially device-resident)" << std::endl; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + std::cout<double with pregenerated workspace(fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + std::cout<double with workspace generated on-the-fly (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 (fields initially device-resident)" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + + std::cout<double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl; + time_sd = time_ds = 0; + for(int i=0;is " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl; + + Grid_finalize(); +} diff --git a/tests/core/Test_prec_change.cc b/tests/core/Test_prec_change.cc new file mode 100644 index 00000000..06b9ae5c --- /dev/null +++ b/tests/core/Test_prec_change.cc @@ -0,0 +1,124 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_prec_change.cc + + 
Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int Ls = 12; + Coordinate latt4 = GridDefaultLatt(); + + GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD); + GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD); + GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD); + + GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF); + GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF); + GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); + + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << 
GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionD field_d(FGridD), tmp_d(FGridD); + random(RNG5,field_d); + RealD norm2_field_d = norm2(field_d); + + LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF); + random(RNG5F,field_d2); + RealD norm2_field_d2 = norm2(field_d2); + + LatticeFermionF field_f(FGridF); + + //Test original implementation + { + std::cout << GridLogMessage << "Testing original implementation" << std::endl; + field_f = Zero(); + precisionChangeOrig(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChangeOrig(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation with pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl; + precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid()); + precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid()); + + field_f = Zero(); + precisionChange(field_f,field_d,wk_dp_to_sp); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? 
"!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f,wk_sp_to_dp); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test new implementation without pregenerated workspace + { + std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl; + field_f = Zero(); + precisionChange(field_f,field_d); + RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d = Zero(); + precisionChange(tmp_d, field_f); + Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + //Test fast implementation + { + std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl; + field_f = Zero(); + precisionChangeFast(field_f,field_d2); + RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl; + tmp_d2 = Zero(); + precisionChangeFast(tmp_d2, field_f); + Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2; + std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? 
"!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl; + } + std::cout << "Done" << std::endl; + + Grid_finalize(); +} diff --git a/tests/solver/Test_dwf_relupcg_prec.cc b/tests/solver/Test_dwf_relupcg_prec.cc new file mode 100644 index 00000000..1d8c022a --- /dev/null +++ b/tests/solver/Test_dwf_relupcg_prec.cc @@ -0,0 +1,143 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_dwf_relupcg_prec.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + double relup_delta = 0.2; + for(int i=1;i> relup_delta; + std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl; + } + } + + const int Ls=12; + + { + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); + GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD result(FGrid); result=Zero(); + LatticeGaugeFieldD Umu(UGrid); + LatticeGaugeFieldF Umu_f(UGrid_f); + + SU::HotConfiguration(RNG4,Umu); + + precisionChange(Umu_f,Umu); + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5); + + LatticeFermionD src_o(FrbGrid); + LatticeFermionD result_o(FrbGrid); + LatticeFermionD result_o_2(FrbGrid); + 
pickCheckerboard(Odd,src_o,src); + result_o.Checkerboard() = Odd; + result_o = Zero(); + result_o_2.Checkerboard() = Odd; + result_o_2 = Zero(); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + ConjugateGradientReliableUpdate mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO); + double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); + for(int i=0;i<1;i++){ + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 23 Feb 2023 09:45:29 -0500 Subject: [PATCH 210/240] Further prec-change improvements Mixed prec CG algorithm has been modified to precompute precision change workspaces As the original Test_dwf_mixedcg_prec has been coopted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test --- .../iterative/ConjugateGradientMixedPrec.h | 9 +- tests/solver/Test_dwf_mixedcg_prec.cc | 122 ++++++++++++++++++ 2 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 tests/solver/Test_dwf_mixedcg_prec.cc diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 31ac55e0..27fee791 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -108,7 +108,10 
@@ NAMESPACE_BEGIN(Grid); GridStopWatch PrecChangeTimer; Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count - + + precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid); + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ //Compute double precision rsd and also new RHS vector. Linop_d.HermOp(sol_d, tmp_d); @@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid); while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? PrecChangeTimer.Start(); - precisionChange(src_f, src_d); + precisionChange(src_f, src_d, pc_wk_dp_to_sp); PrecChangeTimer.Stop(); sol_f = Zero(); @@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid); //Convert sol back to double and add to double prec solution PrecChangeTimer.Start(); - precisionChange(tmp_d, sol_f); + precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp); PrecChangeTimer.Stop(); axpy(sol_d, 1.0, tmp_d, sol_d); diff --git a/tests/solver/Test_dwf_mixedcg_prec.cc b/tests/solver/Test_dwf_mixedcg_prec.cc new file mode 100644 index 00000000..dc88018e --- /dev/null +++ b/tests/solver/Test_dwf_mixedcg_prec.cc @@ -0,0 +1,122 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_cg_prec.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +//using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=12; + + std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); + GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); + GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD result(FGrid); result=Zero(); + LatticeGaugeFieldD Umu(UGrid); + LatticeGaugeFieldF Umu_f(UGrid_f); + + SU::HotConfiguration(RNG4,Umu); + + precisionChange(Umu_f,Umu); + + RealD mass=0.1; + RealD M5=1.8; + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + 
DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5); + + LatticeFermionD src_o(FrbGrid); + LatticeFermionD result_o(FrbGrid); + LatticeFermionD result_o_2(FrbGrid); + pickCheckerboard(Odd,src_o,src); + result_o.Checkerboard() = Odd; + result_o = Zero(); + result_o_2.Checkerboard() = Odd; + result_o_2 = Zero(); + + SchurDiagMooeeOperator HermOpEO(Ddwf); + SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); + double t1,t2,flops; + double MdagMsiteflops = 1452; // Mobius (real coeffs) + // CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of) + double CGsiteflops = (8+4+8+4+4)*Nc*Ns ; + std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops< CG(1.0e-8,10000); + result_o_2 = Zero(); + t1=usecond(); + CG(HermOpEO,src_o,result_o_2); + t2=usecond(); + iters = CG.IterationsToComplete; + flops = MdagMsiteflops*4*FrbGrid->gSites()*iters; + flops+= CGsiteflops*FrbGrid->gSites()*iters; + + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< Date: Thu, 23 Feb 2023 13:09:45 -0500 Subject: [PATCH 211/240] Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string --- Grid/allocator/MemoryManagerShared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..e291ef89 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,7 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode 
mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From 87697eb07ebaf510a776f79bc409efebc7860a23 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 09:07:36 -0700 Subject: [PATCH 212/240] SHared compile --- Grid/allocator/MemoryManagerShared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..e291ef89 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,7 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From cad5b187ddcbce278bdca06b42dfb4ac6bbc59d0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 09:08:16 -0700 Subject: [PATCH 213/240] Cleanup --- Grid/qcd/action/fermion/WilsonKernels.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 68422f28..2d868c27 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -52,13 +52,6 @@ public: typedef AcceleratorVector StencilVector; public: -#ifdef GRID_SYCL -#define SYCL_HACK -#endif -#ifdef SYCL_HACK - static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, - int ss,int sU,const SiteSpinor *in, SiteSpinor *out); -#endif static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, From 
eeb6e0a6e371407ec4d445f7d0459e6a951c2449 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 09:10:27 -0700 Subject: [PATCH 214/240] Renable cache blocking and efficient UPI type SHM comms --- .../WilsonFermion5DImplementation.h | 4 + .../WilsonFermionImplementation.h | 3 + .../WilsonKernelsImplementation.h | 3 +- Grid/stencil/Stencil.h | 104 ++++++++++++++++-- configure.ac | 8 ++ 5 files changed, 112 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 4ca24789..1ddb30ba 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -63,6 +63,10 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, _tmp(&FiveDimRedBlackGrid), Dirichlet(0) { + Stencil.lo = &Lebesgue; + StencilEven.lo = &LebesgueEvenOdd; + StencilOdd.lo = &LebesgueEvenOdd; + // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 2833fdc4..1a262533 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -60,6 +60,9 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, _tmp(&Hgrid), anisotropyCoeff(anis) { + Stencil.lo = &Lebesgue; + StencilEven.lo = &LebesgueEvenOdd; + StencilOdd.lo = &LebesgueEvenOdd; // Allocate the required comms buffer ImportGauge(_Umu); if (anisotropyCoeff.isAnisotropic){ diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index bdba7cb2..b307fad4 100644 --- 
a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -433,7 +433,8 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S }); #define ASM_CALL(A) \ - thread_for( ss, Nsite, { \ + thread_for( sss, Nsite, { \ + int ss = st.lo->Reorder(sss); \ int sU = ss; \ int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index cffede12..a74b720d 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -290,9 +290,9 @@ public: protected: GridBase * _grid; - public: GridBase *Grid(void) const { return _grid; } + LebesgueOrder *lo; //////////////////////////////////////////////////////////////////////// // Needed to conveniently communicate gparity parameters into GPU memory @@ -337,6 +337,7 @@ public: //////////////////////////////////////// // Stencil query //////////////////////////////////////// +#ifdef SHM_FAST_PATH inline int SameNode(int point) { int dimension = this->_directions[point]; @@ -356,7 +357,40 @@ public: if ( displacement == 0 ) return 1; return 0; } +#else + // + inline int SameNode(int point) { + int dimension = this->_directions[point]; + int displacement = this->_distances[point]; + + int pd = _grid->_processors[dimension]; + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + int recv_from_rank; + int xmit_to_rank; + + if ( ! 
comm_dim ) return 1; + + int nbr_proc; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); + + if ( (shm==NULL) ) return 0; + return 1; + } +#endif ////////////////////////////////////////// // Comms packet queue for asynch thread // Use OpenMP Tasks for cleaner ??? @@ -1056,7 +1090,7 @@ public: int comms_recv = this->_comms_recv[point]; int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ; - + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1127,11 +1161,32 @@ public: recv_buf=this->u_recv_buf_p; } + // potential SHM fast path for intranode + int shm_send=0; + int shm_recv=0; +#ifdef SHM_FAST_PATH + // Put directly in place if we can + send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); + if ( (send_buf==NULL) ) { + shm_send=0; + send_buf = this->u_send_buf_p; + } else { + shm_send=1; + } + void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf); + if ( test_ptr != NULL ) shm_recv = 1; + // static int printed; + // if (!printed){ + // std::cout << " GATHER FAST PATH SHM "<u_send_buf_p; // Gather locally, must send assert(send_buf!=NULL); +#endif // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - +#ifdef SHM_FAST_PATH + #warning STENCIL SHM FAST PATH SELECTED + // shm == receive pointer if offnode + // shm == Translate[send pointer] if on node -- my view of his send pointer + cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); + if (shm==NULL) { + shm = rp; + // we found a packet that comes from MPI and contributes to this 
shift. + // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. + // Kernel will add the exterior_terms except if is_same_node. + // leg of stencil + shm_recv=0; + } else { + shm_recv=1; + } + rpointers[i] = shm; + // Test send side + void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp); + if ( test_ptr != NULL ) shm_send = 1; + // static int printed; + // if (!printed){ + // std::cout << " GATHERSIMD FAST PATH SHM "<u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); } diff --git a/configure.ac b/configure.ac index 2e6199c7..fedca3fe 100644 --- a/configure.ac +++ b/configure.ac @@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in ;; *) ;; esac +############### force MPI in SMP +AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no]) +case ${ac_SHM_FAST_PATH} in + yes) + AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] ) + ;; + *) ;; +esac ############### communication type selection AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) From f36b87deb5451a3c1d47c4a25502319395478c3e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:09:00 -0700 Subject: [PATCH 215/240] syscall fix --- Grid/communicator/SharedMemoryMPI.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 4993a02e..9a273dc4 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -29,6 +29,7 @@ Author: Christoph Lehner #include #include +#include #ifdef GRID_CUDA #include From 14cc142a1424d02883887572cef47d1677a91fb1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:09:26 -0700 Subject: [PATCH 216/240] 
Warning remove --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index a74b720d..29aa876f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -387,7 +387,7 @@ public: void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); - if ( (shm==NULL) ) return 0; + if ( shm==NULL ) return 0; return 1; } #endif From 861e5d7f4c0bb56cbb5de550cb2a14e5c1ad15f7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:10:02 -0700 Subject: [PATCH 217/240] SYCL version update. Why do they keep making incompatible changes --- Grid/threads/Accelerator.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 5ac36d15..04ae885b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -248,17 +248,23 @@ inline int acceleratorIsCommunicable(void *ptr) ////////////////////////////////////////////// // SyCL acceleration ////////////////////////////////////////////// -#ifdef GRID_SYCL -NAMESPACE_END(Grid); -#include -#include +#ifdef GRID_SYCL #define GRID_SYCL_LEVEL_ZERO_IPC -#ifdef GRID_SYCL_LEVEL_ZERO_IPC +NAMESPACE_END(Grid); +#if 0 +#include +#include #include #include +#else +#include +#include +#include +#include #endif + NAMESPACE_BEGIN(Grid); extern cl::sycl::queue *theGridAccelerator; From a997d24743abb33c2ec1778bafd805793d1919f7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:10:31 -0700 Subject: [PATCH 218/240] Remove nofma --- systems/PVC/config-command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/PVC/config-command b/systems/PVC/config-command index 3f5b5993..dc6b222c 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -11,5 +11,5 @@ INSTALL=/nfs/site/home/azusayax/install --enable-unified=yes \ CXX=mpicxx \ LDFLAGS="-fsycl-device-code-split=per_kernel 
-fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare" + CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" From e1c326558a54a38e138e9eeaead1aaabaab57440 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 08:53:56 -0700 Subject: [PATCH 219/240] COmms improvements --- Grid/qcd/action/fermion/WilsonCompressor.h | 34 ++++++++++--------- .../WilsonKernelsImplementation.h | 11 ++++++ Grid/stencil/Stencil.h | 4 +-- Grid/threads/Accelerator.h | 4 +-- systems/PVC/benchmarks/run-1tile.sh | 2 +- systems/PVC/benchmarks/run-2tile-mpi.sh | 4 +-- systems/PVC/benchmarks/wrap.sh | 10 +++--- systems/PVC/config-command | 6 ++-- systems/PVC/setup.sh | 3 +- 9 files changed, 46 insertions(+), 32 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index fd1bbe89..5523ae8a 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -484,24 +484,26 @@ public: int dag = compress.dag; int face_idx=0; +#define vet_same_node(a,b) \ + { auto tmp = b; } if ( dag ) { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + 
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + 
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index b307fad4..fcf1f1f3 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -439,6 +439,17 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ }); +#define ASM_CALL_SLICE(A) \ + auto grid = in.Grid() ; \ + int nt = grid->LocalDimensions()[4]; \ + int nxyz = Nsite/nt ; \ + for(int t=0;t::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ + });} template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 29aa876f..c8703b9f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -358,7 +358,7 @@ public: return 0; } #else - // + // fancy calculation for shm code inline int SameNode(int point) { int dimension = this->_directions[point]; @@ -378,7 +378,7 @@ public: int nbr_proc; if (displacement>0) nbr_proc = 1; - else nbr_proc = pd-1; + else nbr_proc = pd-1; // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 04ae885b..2dde1433 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void 
*acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 923afd84..3c594ab6 100644 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8 +mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 export ZE_AFFINITY_MASK=0 export I_MPI_OFFLOAD_CELL=device diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index 9db0b66b..fa56d5ec 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -20,7 +20,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 
export EnableWalkerPartition=0 -mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log +#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index a352fff9..0e48625b 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -if [ $MPI_LOCALRANKID = "0" ] -then +#if [ $MPI_LOCALRANKID = "0" ] +#then # ~psteinbr/build_pti/ze_tracer -c $@ - onetrace --chrome-kernel-timeline $@ -else +# onetrace --chrome-kernel-timeline $@ +#else $@ -fi +#fi diff --git a/systems/PVC/config-command b/systems/PVC/config-command index dc6b222c..7549f2b4 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -1,4 +1,4 @@ -INSTALL=/nfs/site/home/azusayax/install +INSTALL=/nfs/site/home/paboylx/prereqs/ ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ @@ -8,8 +8,8 @@ INSTALL=/nfs/site/home/azusayax/install --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-unified=yes \ + --enable-unified=no \ CXX=mpicxx \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" + 
CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include" diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh index 2a6f920b..9b515a62 100644 --- a/systems/PVC/setup.sh +++ b/systems/PVC/setup.sh @@ -1,5 +1,6 @@ export https_proxy=http://proxy-chain.intel.com:911 -export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH module load intel-release source /opt/intel/oneapi/PVC_setup.sh From 0f2b7864362b92737fb011244c4f74d7cf4e3c62 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 15:36:11 -0400 Subject: [PATCH 220/240] Vector -> vector --- Grid/stencil/SimpleCompressor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index ccbfdb29..dabd70a6 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -36,7 +36,7 @@ public: } template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { assert( (table.size()&0x1)==0); From 88e218e8ee2bc1840aebcb5b7fcb9621add1f2ab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 15:37:58 -0400 Subject: [PATCH 221/240] Stencil updates --- Grid/stencil/Stencil.cc | 21 +++++++++++++++++++++ Grid/stencil/Stencil.h | 14 ++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc index c1b33baa..27dc75ed 100644 --- a/Grid/stencil/Stencil.cc +++ b/Grid/stencil/Stencil.cc @@ -29,6 +29,27 @@ NAMESPACE_BEGIN(Grid); +uint64_t DslashFullCount; +uint64_t DslashPartialCount; +uint64_t DslashDirichletCount; + +void DslashResetCounts(void) +{ + DslashFullCount=0; + 
DslashPartialCount=0; + DslashDirichletCount=0; +} +void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) +{ + dirichlet = DslashDirichletCount; + partial = DslashPartialCount; + full = DslashFullCount; +} +void DslashLogFull(void) { DslashFullCount++;} +void DslashLogPartial(void) { DslashPartialCount++;} +void DslashLogDirichlet(void){ DslashDirichletCount++;} + + void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table) { diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 93a3cb68..356cf73f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -120,6 +120,12 @@ void Gather_plane_exchange_table(commVector >& table, } */ +void DslashResetCounts(void); +void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); +void DslashLogFull(void); +void DslashLogPartial(void); +void DslashLogDirichlet(void); + struct StencilEntry { #ifdef GRID_CUDA uint64_t _byte_offset; // 8 bytes @@ -312,6 +318,7 @@ public: int face_table_computed; int partialDirichlet; + int fullDirichlet; std::vector > > face_table ; Vector surface_list; @@ -408,6 +415,9 @@ public: void CommunicateComplete(std::vector > &reqs) { _grid->StencilSendToRecvFromComplete(MpiReqs,0); + if ( this->partialDirichlet ) DslashLogPartial(); + else if ( this->fullDirichlet ) DslashLogDirichlet(); + else DslashLogFull(); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. 
@@ -736,6 +746,10 @@ public: if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); partialDirichlet = p.partialDirichlet; DirichletBlock(p.dirichlet); // comms send/recv set up + fullDirichlet=0; + for(int d=0;d Date: Tue, 21 Mar 2023 15:38:39 -0400 Subject: [PATCH 222/240] AUdit fix --- Grid/allocator/MemoryManagerShared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..e291ef89 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,7 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From 4135f2dcd1875816ade5a9b0194a32e07576afcb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 15:41:41 -0400 Subject: [PATCH 223/240] Compressor --- Grid/qcd/action/fermion/WilsonCompressor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index aaf612fc..a1234e3b 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid); // Wilson compressor will need FaceGather policies for: // Periodic, Dirichlet, and partial Dirichlet for DWF /////////////////////////////////////////////////////////////// -const int dwf_compressor_depth=1; +const int dwf_compressor_depth=2; #define DWF_COMPRESS class FaceGatherPartialDWF { @@ -110,7 +110,7 @@ public: 
//////////////////////////////////////////////////////////////////////////////////////////// template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { GridBase *Grid = rhs.Grid(); @@ -209,7 +209,7 @@ public: } template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { // std::cout << " face gather exch DWF partial "< Date: Tue, 21 Mar 2023 15:58:49 -0400 Subject: [PATCH 224/240] Integrator update --- Grid/qcd/hmc/integrators/Integrator.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 85a135ad..38d6e561 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -284,6 +284,15 @@ public: << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl; } } + std::cout << GridLogMessage << "--------------------------- "< Date: Tue, 21 Mar 2023 15:59:29 -0400 Subject: [PATCH 225/240] Test update --- tests/forces/Test_double_ratio.cc | 33 ++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/forces/Test_double_ratio.cc b/tests/forces/Test_double_ratio.cc index 0a350692..dc6713f9 100644 --- a/tests/forces/Test_double_ratio.cc +++ b/tests/forces/Test_double_ratio.cc @@ -476,6 +476,20 @@ int main (int argc, char ** argv) // ForceTest(BdyNf2eo,U,DDHMCFilter); //////////////////// One flavour boundary det //////////////////// + RationalActionParams OFRp; // Up/down + OFRp.lo = 6.0e-5; + OFRp.hi = 90.0; + OFRp.inv_pow = 2; + OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space + OFRp.action_tolerance= 1.0e-8; + 
OFRp.action_degree = 18; + OFRp.md_tolerance= 1.0e-5; + OFRp.md_degree = 14; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + /* OneFlavourRationalParams OFRp; // Up/down OFRp.lo = 4.0e-5; OFRp.hi = 90.0; @@ -485,6 +499,23 @@ int main (int argc, char ** argv) OFRp.degree = 18; OFRp.precision= 80; OFRp.BoundsCheckFreq=0; + */ + std::vector ActionTolByPole({ + 1.0e-7,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + 1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more + // 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + // 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + /* std::vector ActionTolByPole({ 1.0e-8,1.0e-8,1.0e-8,1.0e-8, 1.0e-8,1.0e-8,1.0e-8,1.0e-8, @@ -499,9 +530,9 @@ int main (int argc, char ** argv) // 1.0e-8,1.0e-8,1.0e-8,1.0e-8, 1.0e-8,1.0e-8,1.0e-8,1.0e-8, 1.0e-8,1.0e-8,1.0e-8,1.0e-8, - 1.0e-8,1.0e-8,1.0e-8,1.0e-8, 1.0e-8,1.0e-8 }); + */ OneFlavourEvenOddRatioRationalPseudoFermionAction BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp); BdySqrt.SetTolerances(ActionTolByPole,MDTolByPole); ForceTest(BdySqrt,U,DDHMCFilter); From 8b43be39c0fe73864e821ae0fa93863d97d5044e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 16:00:52 -0400 Subject: [PATCH 226/240] Config command --- systems/Crusher/config-command | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 3965767f..d310ff55 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -1,12 +1,13 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` ../../configure --enable-comms=mpi-auto \ --with-lime=$CLIME \ ---enable-unified=no \ +--enable-unified=yes \ --enable-shm=nvlink \ --enable-tracing=timer \ 
--enable-accelerator=hip \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ +--disable-accelerator-cshift \ --with-gmp=$OLCF_GMP_ROOT \ --with-fftw=$FFTW_DIR/.. \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ From c6621806ca73b9384b3bcae6cc8ef47157262be7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 17:27:09 -0400 Subject: [PATCH 227/240] Compiling on laptop and running --- Grid/qcd/action/fermion/WilsonCompressor.h | 4 ++-- Grid/stencil/SimpleCompressor.h | 2 +- tests/Test_dwf_mixedcg_prec.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 6e46e0b5..b2c07d18 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -110,7 +110,7 @@ public: //////////////////////////////////////////////////////////////////////////////////////////// template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { GridBase *Grid = rhs.Grid(); @@ -209,7 +209,7 @@ public: } template static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { // std::cout << " face gather exch DWF partial "< static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, + std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { assert( (table.size()&0x1)==0); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index 1e6da515..cbc573d1 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -101,7 +101,7 @@ int main (int 
argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction, + public OperatorFunction +{ +public: + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterationsMshift; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion + std::vector IterationsToCompleteShift; // Iterations for this shift + int verbose; + MultiShiftFunction shifts; + std::vector TrueResidualShift; + + int ReliableUpdateFreq; //number of iterations between reliable updates + + GridBase* SinglePrecGrid; //Grid for single-precision fields + LinearOperatorBase &Linop_f; //single precision + + ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts, + GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, + int _ReliableUpdateFreq) : + MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq), + MaxIterations(20000) + { + verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); + } + + void operator() (LinearOperatorBase &Linop, const FieldD &src, FieldD &psi) + { + GridBase *grid = src.Grid(); + int nshift = shifts.order; + std::vector results(nshift,grid); + (*this)(Linop,src,results,psi); + } + void operator() (LinearOperatorBase &Linop, const FieldD &src, std::vector &results, FieldD &psi) + { + int nshift = shifts.order; + + (*this)(Linop,src,results); + + psi = shifts.norm*src; + for(int i=0;i &Linop_d, const FieldD &src_d, std::vector &psi_d) + { + GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup"); + GridBase *DoublePrecGrid = src_d.Grid(); + + //////////////////////////////////////////////////////////////////////// + // Convenience 
references to the info stored in "MultiShiftFunction" + //////////////////////////////////////////////////////////////////////// + int nshift = shifts.order; + + std::vector &mass(shifts.poles); // Make references to array in "shifts" + std::vector &mresidual(shifts.tolerances); + std::vector alpha(nshift,1.0); + + //Double precision search directions + FieldD p_d(DoublePrecGrid); + std::vector ps_f (nshift, SinglePrecGrid);// Search directions (single precision) + std::vector psi_f(nshift, SinglePrecGrid);// solutions (single precision) + + FieldD tmp_d(DoublePrecGrid); + FieldD r_d(DoublePrecGrid); + FieldF r_f(SinglePrecGrid); + FieldD mmp_d(DoublePrecGrid); + + assert(psi_d.size()==nshift); + assert(mass.size()==nshift); + assert(mresidual.size()==nshift); + + // dynamic sized arrays on stack; 2d is a pain with vector + RealD bs[nshift]; + RealD rsq[nshift]; + RealD rsqf[nshift]; + RealD z[nshift][2]; + int converged[nshift]; + + const int primary =0; + + //Primary shift fields CG iteration + RealD a,b,c,d; + RealD cp,bp,qq; //prev + + // Matrix mult fields + FieldF p_f(SinglePrecGrid); + FieldF mmp_f(SinglePrecGrid); + + // Check lightest mass + for(int s=0;s= mass[primary] ); + converged[s]=0; + } + + // Wire guess to zero + // Residuals "r" are src + // First search direction "p" is also src + cp = norm2(src_d); + + // Handle trivial case of zero src. + if( cp == 0. 
){ + for(int s=0;s= rsq[s]){ + CleanupTimer.Start(); + std::cout< Linop_shift_d(Linop_d, mass[s]); + ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop Linop_shift_f(Linop_f, mass[s]); + + MixedPrecisionConjugateGradient cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); + cg(src_d, psi_d[s]); + + TrueResidualShift[s] = cg.TrueResidual; + CleanupTimer.Stop(); + } + } + + std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"< Date: Thu, 23 Mar 2023 15:39:30 -0400 Subject: [PATCH 232/240] Mshift update --- HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc | 34 +-- TODO | 2 + tests/forces/Test_bdy.cc | 305 +++++++++++++++++++++++++++ 3 files changed, 313 insertions(+), 28 deletions(-) create mode 100644 tests/forces/Test_bdy.cc diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc index 059a0f20..5572d11f 100644 --- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc @@ -164,11 +164,6 @@ int main(int argc, char **argv) { typedef MobiusEOFAFermionF FermionEOFAActionF; typedef typename FermionActionF::FermionField FermionFieldF; - typedef WilsonImplD2 FermionImplPolicyD2; - typedef MobiusFermionD2 FermionActionD2; - typedef MobiusEOFAFermionD2 FermionEOFAActionD2; - typedef typename FermionActionD2::FermionField FermionFieldD2; - typedef Grid::XmlReader Serialiser; //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: @@ -268,10 +263,8 @@ int main(int argc, char **argv) { typedef SchurDiagMooeeOperator LinearOperatorF; typedef SchurDiagMooeeOperator LinearOperatorD; - typedef SchurDiagMooeeOperator LinearOperatorD2; typedef SchurDiagMooeeOperator LinearOperatorEOFAF; typedef SchurDiagMooeeOperator LinearOperatorEOFAD; - typedef SchurDiagMooeeOperator LinearOperatorEOFAD2; typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG; typedef MixedPrecisionConjugateGradientOperatorFunction MxPCG_EOFA; @@ 
-324,7 +317,6 @@ int main(int argc, char **argv) { // temporarily need a gauge field LatticeGaugeFieldD U(GridPtr); U=Zero(); LatticeGaugeFieldF UF(GridPtrF); UF=Zero(); - LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero(); std::cout << GridLogMessage << " Running the HMC "<< std::endl; TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file @@ -428,7 +420,7 @@ int main(int argc, char **argv) { ActionCGL, ActionCGR, DerivativeCGL, DerivativeCGR, SFRp, true); - // Level2.push_back(&EOFA); + Level2.push_back(&EOFA); //////////////////////////////////// // up down action @@ -453,15 +445,13 @@ int main(int argc, char **argv) { std::vector Denominators; std::vector NumeratorsF; std::vector DenominatorsF; - std::vector NumeratorsD2; - std::vector DenominatorsD2; std::vector *> Quotients; std::vector ActionMPCG; std::vector MPCG; #define MIXED_PRECISION #ifdef MIXED_PRECISION - std::vector *> Bdys; + std::vector *> Bdys; #else std::vector *> Bdys; #endif @@ -536,27 +526,15 @@ int main(int argc, char **argv) { Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); } else { #ifdef MIXED_PRECISION - // Use the D2 data types and make them use same grid as single - FermionActionD2::ImplParams ParamsDenD2(boundary); - FermionActionD2::ImplParams ParamsNumD2(boundary); - - ParamsDenD2.dirichlet = ParamsDen.dirichlet; - ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet; - DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2)); - - ParamsNumD2.dirichlet = ParamsNum.dirichlet; - ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet; - NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2)); - - Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction( + Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction( 
*Numerators[h],*Denominators[h], *NumeratorsF[h],*DenominatorsF[h], - *NumeratorsD2[h],*DenominatorsD2[h], + *Numerators[h],*Denominators[h], OFRp, SP_iters) ); - Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction( + Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction( *Numerators[h],*Denominators[h], *NumeratorsF[h],*DenominatorsF[h], - *NumeratorsD2[h],*DenominatorsD2[h], + *Numerators[h],*Denominators[h], OFRp, SP_iters) ); #else Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); diff --git a/TODO b/TODO index a6d0f2ac..750deb55 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,8 @@ - - Also faster non-atomic reduction - - Remaining PRs - - DDHMC + - - MixedPrec is the action eval, high precision + - - MixedPrecCleanup is the force eval, low precision ================= ================= diff --git a/tests/forces/Test_bdy.cc b/tests/forces/Test_bdy.cc new file mode 100644 index 00000000..c2c97d0d --- /dev/null +++ b/tests/forces/Test_bdy.cc @@ -0,0 +1,305 @@ +/* + + 2f Full det MdagM 10^6 force ~ 1.3e7 +rid : Message : 1767.283471 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 1767.283476 s : S1 : 1.52885e+09 +Grid : Message : 1767.283480 s : S2 : 1.52886e+09 +Grid : Message : 1767.283482 s : dS : 8877.34 +Grid : Message : 1767.283483 s : dSpred : 8877.7 +Grid : Message : 1767.283484 s : diff : -0.360484 +Grid : Message : 1767.283485 s : ********************************************************* + + 2f Full det MpcdagMpc 10^6 force ~ 1.8e6 +Grid : Message : 2399.576962 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 2399.576968 s : S1 : 1.52885e+09 +Grid : Message : 2399.576972 s : S2 : 1.52886e+09 +Grid : Message : 2399.576974 s : dS : 9728.49 +Grid : Message : 2399.576975 s : dSpred : 9726.58 +Grid : Message : 2399.576976 s : diff : 1.90683 +Grid : Message : 2399.576977 s : 
********************************************************* + + 2f bdy MdagM 1500 force Force ~ 2800 +Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 4622.385067 s : S1 : 1.52885e+09 +Grid : Message : 4622.385071 s : S2 : 1.52885e+09 +Grid : Message : 4622.385072 s : dS : 25.4944 +Grid : Message : 4622.385073 s : dSpred : 25.4672 +Grid : Message : 4622.385074 s : diff : 0.0271414 +Grid : Message : 4622.385075 s : ********************************************************* + + 2f bdy MpcdagMpc 10^6 force Force ~ 2200 +Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +Grid : Message : 4622.385067 s : S1 : 1.52885e+09 +Grid : Message : 4622.385071 s : S2 : 1.52885e+09 +Grid : Message : 4622.385072 s : dS : 25.4944 +Grid : Message : 4622.385073 s : dSpred : 25.4672 +Grid : Message : 4622.385074 s : diff : 0.0271414 +Grid : Message : 4622.385075 s : ********************************************************* + + 1f Bdy Det + Optimisation log: looser rational AND MD tolerances sloppy +MobiusForce.221179 -- same as HMC. 
dS is mispredicted Forece ~2.8 +Grid : Message : 6582.258991 s : dS : 0.024478 +Grid : Message : 6582.258992 s : dSpred : 0.00791876 +Grid : Message : 6582.258994 s : diff : 0.0165592 + +MobiusForce.221193 -- tight rational AND MD tolerances to 1e-8 ~ 2.8 same +Grid : Message : 1964.939209 s : S1 : 7.64404e+08 +Grid : Message : 1964.939213 s : S2 : 7.64404e+08 +Grid : Message : 1964.939215 s : dS : -0.00775838 <--- too loose even on action +Grid : Message : 1964.939216 s : dSpred : -0.00416793 +Grid : Message : 1964.939217 s : diff : -0.00359045 + +MobiusForce.221394 -- looser rational, MD tol 1e-8 ~ 2.8 same +Grid : Message : 1198.346720 s : S1 : 764404649.48886 +Grid : Message : 1198.346760 s : S2 : 764404649.5133 +Grid : Message : 1198.346780 s : dS : 0.024440884590149 +Grid : Message : 1198.346800 s : dSpred : 0.0079145154465184 +Grid : Message : 1198.346810 s : diff : 0.016526369143631 + +MobiusForce.221394 -- tight rational, MD tol sloppy Force ~ 2.8 +Grid : Message : 2376.921950 s : S1 : 764404436.44069 +Grid : Message : 2376.921954 s : S2 : 764404436.43299 +Grid : Message : 2376.921956 s : dS : -0.0076971054077148 +Grid : Message : 2376.921958 s : dSpred : -0.0041610472282526 +Grid : Message : 2376.921959 s : diff : -0.0035360581794623 + +*/ + +// +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_double_ratio.cc + + Copyright (C) 2022 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +typedef MobiusFermionD FermionAction; +typedef WilsonImplD FimplD; +typedef WilsonImplD FermionImplPolicy; + +template +void ForceTest(Action &action,LatticeGaugeField & U,MomentumFilterBase &Filter) +{ + GridBase *UGrid = U.Grid(); + + std::vector seeds({1,2,3,5}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + + LatticeColourMatrix Pmu(UGrid); + LatticeGaugeField P(UGrid); + LatticeGaugeField UdSdU(UGrid); + + std::cout << GridLogMessage << "*********************************************************"<(UdSdU,mu); + Pmu= PeekIndex(P,mu); + dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*2.0; + } + ComplexD dSpred = sum(dS); + RealD diff = S2-S1-dSpred.real(); + + std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt_size[0]/mpi_layout[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt_size[1]/mpi_layout[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt_size[2]/mpi_layout[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt_size[3]/mpi_layout[3] * shm[3]; + + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionAction::ImplParams ParamsDir(boundary); + Params.dirichlet=NonDirichlet; + ParamsDir.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=1; + + ///////////////////// Gauge Field and Gauge Forces //////////////////////////// + LatticeGaugeField U(UGrid); + + RealD beta=6.0; + WilsonGaugeActionR PlaqAction(beta); + IwasakiGaugeActionR RectAction(beta); + + MomentumFilterNone FilterNone; + ForceTest(PlaqAction,U,FilterNone); + ForceTest(RectAction,U,FilterNone); + + //////////////////////////////////// + // Action + //////////////////////////////////// + RealD mass=0.00078; + RealD pvmass=1.0; + RealD M5=1.8; + RealD b=1.5; + RealD c=0.5; + + // Double versions + FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params); + FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params); + FermionAction DdwfDirichlet(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,ParamsDir); + + double StoppingCondition = 1.0e-8; + double MaxCGIterations = 50000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + //////////////////// Two Flavour Determinant Ratio /////////////////////////////// + TwoFlavourRatioPseudoFermionAction Nf2(PVPeriodic, DdwfPeriodic,CG,CG); + // ForceTest(Nf2,U,FilterNone); + + //////////////////// Two Flavour Determinant force test Even Odd /////////////////////////////// + TwoFlavourEvenOddRatioPseudoFermionAction Nf2eo(PVPeriodic, 
DdwfPeriodic,CG,CG); + // ForceTest(Nf2eo,U,FilterNone); + + //////////////////// Domain forces //////////////////// + int Width=4; + DDHMCFilter DDHMCFilter(Block4,Width); + + //////////////////// Two flavour boundary det //////////////////// + TwoFlavourRatioPseudoFermionAction BdyNf2(DdwfDirichlet, DdwfPeriodic,CG,CG); + // ForceTest(BdyNf2,U,DDHMCFilter); + + //////////////////// Two flavour eo boundary det //////////////////// + TwoFlavourEvenOddRatioPseudoFermionAction BdyNf2eo(DdwfDirichlet, DdwfPeriodic,CG,CG); + // ForceTest(BdyNf2eo,U,DDHMCFilter); + + //////////////////// One flavour boundary det //////////////////// + OneFlavourRationalParams OFRp; // Up/down + OFRp.lo = 4.0e-5; + OFRp.hi = 90.0; + OFRp.MaxIter = 60000; + OFRp.tolerance= 1.0e-8; + OFRp.mdtolerance= 1.0e-6; + OFRp.degree = 18; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector ActionTolByPole({ + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + 1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy + // 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8 + }); + OneFlavourEvenOddRatioRationalPseudoFermionAction BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp); + ForceTest(BdySqrt,U,DDHMCFilter); + + Grid_finalize(); +} From d8a9a745d8117a924a4255422b8f984c14d511fd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 24 Mar 2023 15:40:30 -0400 Subject: [PATCH 233/240] stream synchronise --- Grid/lattice/Lattice_reduction_gpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 4bdcce0b..bd83a1ea 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -217,19 +217,19 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi // which worked with earlier drivers. 
// Not sure which driver had first fail and this bears checking // Is awkward as must install multiple driver versions -#undef UVM_BLOCK_BUFFER +#undef UVM_BLOCK_BUFFER #ifndef UVM_BLOCK_BUFFER commVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); #else Vector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); result = *buffer_v; #endif From 900e01f49bca04257ee979e1c7e97c2dc1e3cd9e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Mar 2023 21:35:06 -0700 Subject: [PATCH 234/240] Temporary --- benchmarks/Benchmark_dwf_fp32_paranoid.cc | 387 ++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 benchmarks/Benchmark_dwf_fp32_paranoid.cc diff --git a/benchmarks/Benchmark_dwf_fp32_paranoid.cc b/benchmarks/Benchmark_dwf_fp32_paranoid.cc new file mode 100644 index 00000000..20f23b60 --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32_paranoid.cc @@ -0,0 +1,387 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=16; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); 
RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + LatticeFermionF src1 (FGrid); random(RNG5,src1); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + // LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " < Date: Tue, 28 Mar 2023 08:34:24 -0700 Subject: [PATCH 235/240] Commet --- Grid/lattice/Lattice_reduction_gpu.h 
| 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 4bdcce0b..b1139434 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -211,12 +211,9 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi assert(ok); Integer smemSize = numThreads * sizeof(sobj); - // UVM seems to be buggy under later CUDA drivers - // This fails on A100 and driver 5.30.02 / CUDA 12.1 - // Fails with multiple NVCC versions back to 11.4, - // which worked with earlier drivers. - // Not sure which driver had first fail and this bears checking - // Is awkward as must install multiple driver versions + // Move out of UVM + // Turns out I had messed up the synchronise after move to compute stream + // as running this on the default stream fools the synchronise #undef UVM_BLOCK_BUFFER #ifndef UVM_BLOCK_BUFFER commVector buffer(numBlocks); From 6af97069b93cb61503cad8770e4ccd6b7532ae98 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 13:39:44 -0700 Subject: [PATCH 236/240] Preparing for close of feature/dirichlet Initial code change review complete --- .../WilsonKernelsImplementation.h | 4 ---- ...ayleyFermion5DInstantiationWilsonImplD2.cc | 1 - ...ctionFermion5DInstantiationWilsonImplD2.cc | 1 - ...allEOFAFermionInstantiationWilsonImplD2.cc | 1 - ...iusEOFAFermionInstantiationWilsonImplD2.cc | 1 - ...ctionFermion5DInstantiationWilsonImplD2.cc | 1 - ...nCloverFermionInstantiationWilsonImplD2.cc | 1 - ...ilsonFermion5DInstantiationWilsonImplD2.cc | 1 - .../WilsonFermionInstantiationWilsonImplD2.cc | 1 - .../WilsonKernelsInstantiationWilsonImplD2.cc | 1 - ...ilsonTMFermionInstantiationWilsonImplD2.cc | 1 - .../fermion/instantiation/WilsonImplD2/impl.h | 1 - ...yleyFermion5DInstantiationZWilsonImplD2.cc | 1 - ...tionFermion5DInstantiationZWilsonImplD2.cc | 1 - ...llEOFAFermionInstantiationZWilsonImplD2.cc | 1 - 
...usEOFAFermionInstantiationZWilsonImplD2.cc | 1 - ...tionFermion5DInstantiationZWilsonImplD2.cc | 1 - ...lsonFermion5DInstantiationZWilsonImplD2.cc | 1 - ...WilsonKernelsInstantiationZWilsonImplD2.cc | 1 - .../instantiation/ZWilsonImplD2/impl.h | 1 - .../pseudofermion/TwoFlavourEvenOddRatio.h | 13 ------------ Grid/qcd/hmc/integrators/Integrator.h | 21 +++++++------------ 22 files changed, 8 insertions(+), 49 deletions(-) delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc delete mode 100644 Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 
Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc delete mode 100644 Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index fcf1f1f3..d7541054 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -463,11 +463,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifdef SYCL_HACK - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } -#else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} -#endif #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git 
a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git 
a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc 
b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h deleted file mode 100644 index a836ff03..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION WilsonImplD2 diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- 
a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc +++ 
/dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h deleted file mode 100644 index 067d6080..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION ZWilsonImplD2 diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index 476b1c53..c0e2c1d3 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); // NumOp == V // DenOp == M // - AUDIT(); FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); - AUDIT(); pickCheckerboard(Even,etaEven,eta); - AUDIT(); pickCheckerboard(Odd,etaOdd,eta); - AUDIT(); NumOp.ImportGauge(U); - AUDIT(); DenOp.ImportGauge(U); std::cout << " TwoFlavourRefresh: Imported gauge "< Mpc(DenOp); - AUDIT(); SchurDifferentiableOperator Vpc(NumOp); - AUDIT(); std::cout << " TwoFlavourRefresh: Diff ops "<deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); std::cout << GridLogMessage << "AuditForce["<is_smeared << std::endl; auto name = as[level].actions.at(a)->action_name(); @@ -382,12 +380,12 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "AuditRefresh["<refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); as[level].actions.at(actionID)->refresh_timer_stop(); std::cout << GridLogMessage << "AuditRefresh["<is_smeared); @@ -434,7 +432,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID 
<< "] H = " << Hterm << std::endl; H += Hterm; - AUDIT(); + } as[level].apply(S_hireps, Representations, level, H); } @@ -447,9 +445,9 @@ public: void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { for (int a = 0; a < repr_set.size(); ++a) { - AUDIT(); + RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); - AUDIT(); + std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; H += Hterm; @@ -474,10 +472,10 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; as[level].actions.at(actionID)->S_timer_start(); - AUDIT(); + Hterm = as[level].actions.at(actionID)->Sinitial(Us); as[level].actions.at(actionID)->S_timer_stop(); - AUDIT(); + std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } @@ -490,7 +488,6 @@ public: void integrate(Field& U) { - AUDIT(); // reset the clocks t_U = 0; for (int level = 0; level < as.size(); ++level) { @@ -508,10 +505,8 @@ public: assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } - AUDIT(); FieldImplementation::Project(U); - AUDIT(); // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); From 4a261fab303b0338cb0529d83144860535155675 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 20:04:21 -0700 Subject: [PATCH 237/240] Changes premerge to develop --- Grid/lattice/Lattice_rng.h | 11 +---------- .../implementation/WilsonKernelsImplementation.h | 9 +++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 180b8437..b7ef0e82 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -440,17 +440,8 @@ public: 
_grid->GlobalCoorToGlobalIndex(gcoor,gidx); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); -#if 1 - assert(rank == _grid->ThisRank() ); -#else -// - if (rank != _grid->ThisRank() ){ - std::cout <<"rank "<ThisRank() "<<_grid->ThisRank()<< std::endl; -// exit(-42); -// assert(0); - } -#endif + assert(rank == _grid->ThisRank() ); int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index d7541054..ce7cd49c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,6 +462,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -474,6 +475,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA @@ -493,15 +495,15 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA if (Opt == 
WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif - acceleratorFenceComputeStream(); } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif @@ -512,7 +514,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif - acceleratorFenceComputeStream(); } assert(0 && " Kernel optimisation case not covered "); } From 7212432f431c6a8e784bbe77bcdf9f294fce4296 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Mar 2023 20:10:22 -0700 Subject: [PATCH 238/240] More careful fencing --- .../action/fermion/implementation/WilsonKernelsImplementation.h | 2 -- Grid/stencil/Stencil.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index ce7cd49c..70e2477f 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,7 +462,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { - acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -495,7 +494,6 @@ 
void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { - acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c9287bf2..1568cbf9 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -665,9 +665,11 @@ public: for(int i=0;i Date: Wed, 29 Mar 2023 14:36:50 -0400 Subject: [PATCH 239/240] Compile fix on Nvidia --- Grid/communicator/SharedMemoryMPI.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 3248d328..3a70395c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -38,9 +38,8 @@ Author: Christoph Lehner #include #endif #ifdef GRID_SYCL - -#endif #define GRID_SYCL_LEVEL_ZERO_IPC +#endif NAMESPACE_BEGIN(Grid); From c42e25e5b8b275aaf06d4f73e1fbed65533735bf Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 29 Mar 2023 16:25:52 -0400 Subject: [PATCH 240/240] Dirichlet remove --- tests/Test_cayley_even_odd_vec.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc index e25d6fde..243d1f72 100644 --- a/tests/Test_cayley_even_odd_vec.cc +++ b/tests/Test_cayley_even_odd_vec.cc @@ -73,12 +73,12 @@ int main (int argc, char ** argv) RealD M5 =1.8; std::cout< boundary = {1,1,1,-1}; DomainWallFermionD::ImplParams Params(boundary); - Coordinate Dirichlet({0,8,8,16,32}); - Params.dirichlet=Dirichlet; + // Coordinate Dirichlet({0,8,8,16,32}); + // Params.dirichlet=Dirichlet; DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params); TestWhat(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);