From c144b32368fdd8a0338a2c0c5c0c70675e43c048 Mon Sep 17 00:00:00 2001 From: James Richings Date: Tue, 26 Oct 2021 10:37:24 +0100 Subject: [PATCH 01/30] deflation timers --- Grid/algorithms/iterative/Deflation.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h index 43fe3e35..4da8e037 100644 --- a/Grid/algorithms/iterative/Deflation.h +++ b/Grid/algorithms/iterative/Deflation.h @@ -70,12 +70,26 @@ public: } virtual void operator()(const Field &src,Field &guess) { + GridStopWatch w1; + GridStopWatch w2; + + w1.Start(); guess = Zero(); + w1.Stop(); + + LOG(Message) << "Zeroing the 'out' vector took: " << w1.Elapsed() << std::endl; + + w2.Start(); for (int i=0;i Date: Fri, 29 Oct 2021 13:01:31 +0100 Subject: [PATCH 02/30] reverse previous "fix", missing statement was probably intentional, added a comment to that effect --- Grid/parallelIO/IldgIO.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index 9ed773dd..576f08bd 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -576,7 +576,8 @@ class ScidacReader : public GridLimeReader { std::string rec_name(ILDG_BINARY_DATA); while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) { - skipPastObjectRecord(std::string(GRID_FIELD_NORM)); + // in principle should do the line below, but that breaks backard compatibility with old data + // skipPastObjectRecord(std::string(GRID_FIELD_NORM)); skipPastObjectRecord(std::string(SCIDAC_CHECKSUM)); return; } From 2ad181164266000e04fa109b656ca1922b5ac3a4 Mon Sep 17 00:00:00 2001 From: James Richings Date: Tue, 9 Nov 2021 12:33:25 +0000 Subject: [PATCH 03/30] Added timing to deflation code. --- Grid/algorithms/iterative/Deflation.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h index 814a0432..83ec1ab0 100644 --- a/Grid/algorithms/iterative/Deflation.h +++ b/Grid/algorithms/iterative/Deflation.h @@ -75,24 +75,19 @@ public: virtual void operator()(const Field &src,Field &guess) { GridStopWatch w1; - GridStopWatch w2; - w1.Start(); guess = Zero(); - w1.Stop(); - LOG(Message) << "Zeroing the 'out' vector took: " << w1.Elapsed() << std::endl; - - w2.Start(); + w1.Start(); for (int i=0;i Date: Tue, 9 Nov 2021 12:57:09 +0000 Subject: [PATCH 04/30] Helper functions to allow probe of cache state of lattice objects. --- Grid/allocator/MemoryManager.h | 1 + Grid/allocator/MemoryManagerCache.cc | 29 +++++++++++++++++++++++++++ Grid/allocator/MemoryManagerShared.cc | 4 ++++ Grid/lattice/Lattice_base.h | 7 +++++++ 4 files changed, 41 insertions(+) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index eafcd83f..740d8d92 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -170,6 +170,7 @@ private: public: static void Print(void); + static void PrintState( void* CpuPtr); static int isOpen (void* CpuPtr); static void ViewClose(void* CpuPtr,ViewMode mode); static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 72111dbd..61dc1125 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -474,6 +474,35 @@ int MemoryManager::isOpen (void* _CpuPtr) } } +void MemoryManager::PrintState(void* _CpuPtr) +{ + uint64_t CpuPtr = (uint64_t)_CpuPtr; + + if ( EntryPresent(CpuPtr) ){ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + if ( AccCache.state==EvictNext) str = std::string("EvictNext"); + + std::cout << GridLogMessage << "--------------------------------------------" << std::endl; + std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<Device memory movement not currently managed by Grid." << std::endl; +}; void MemoryManager::Print(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 3ad9f913..9c3d723f 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -88,6 +88,13 @@ public: LatticeView accessor(*( (LatticeAccelerator *) this),mode); accessor.ViewClose(); } + + // Helper function to print the state of this object in the AccCache + void PrintCacheState(void) + { + MemoryManager::PrintState(this->_odata); + } + ///////////////////////////////////////////////////////////////////////////////// // Return a view object that may be dereferenced in site loops. // The view is trivially copy constructible and may be copied to an accelerator device From 829a32845171e444415827f3800d6d7d45b5c9ec Mon Sep 17 00:00:00 2001 From: James Richings Date: Tue, 9 Nov 2021 20:46:57 +0000 Subject: [PATCH 05/30] remove deflation timing --- Grid/algorithms/iterative/Deflation.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h index 83ec1ab0..9f85ac70 100644 --- a/Grid/algorithms/iterative/Deflation.h +++ b/Grid/algorithms/iterative/Deflation.h @@ -74,21 +74,11 @@ public: } virtual void operator()(const Field &src,Field &guess) { - GridStopWatch w1; - guess = Zero(); - - w1.Start(); for (int i=0;i Date: Tue, 9 Nov 2021 21:20:36 +0000 Subject: [PATCH 06/30] fix to deflation.h --- Grid/algorithms/iterative/Deflation.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h index 9f85ac70..2eb28bf9 100644 --- a/Grid/algorithms/iterative/Deflation.h +++ b/Grid/algorithms/iterative/Deflation.h @@ -78,6 +78,7 @@ public: for (int i=0;i Date: Tue, 9 Nov 2021 21:56:23 +0000 Subject: [PATCH 07/30] Format edit --- Grid/allocator/MemoryManagerCache.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 61dc1125..04b3fe95 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -488,15 +488,12 @@ void MemoryManager::PrintState(void* _CpuPtr) if ( AccCache.state==Consistent)str = std::string("Consistent"); if ( AccCache.state==EvictNext) str = std::string("EvictNext"); - std::cout << GridLogMessage << "--------------------------------------------" << std::endl; std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "< Date: Mon, 22 Nov 2021 20:44:39 -0500 Subject: [PATCH 08/30] HIP improvements on messaging and intranode hipMemCopyAsynch --- Grid/threads/Accelerator.cc | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 9f27f12b..240758b3 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -84,7 +84,8 @@ void acceleratorInit(void) // IBM Jsrun makes cuda Device numbering screwy and not match rank if ( world_rank == 0 ) { printf("AcceleratorCudaInit: using default device \n"); - printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n"); + printf("AcceleratorCudaInit: assume user either uses\n"); + printf("AcceleratorCudaInit: a) IBM jsrun, or \n"); printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n"); } @@ -109,6 +110,7 @@ void acceleratorInit(void) #ifdef GRID_HIP hipDeviceProp_t *gpu_props; +hipStream_t copyStream; void acceleratorInit(void) { int nDevices = 1; @@ -166,16 +168,25 @@ void acceleratorInit(void) #ifdef GRID_DEFAULT_GPU if ( world_rank == 0 ) { printf("AcceleratorHipInit: using default device \n"); - printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); - printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n"); + printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n"); } + int device = 0; #else if ( world_rank == 0 ) { printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank); - printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n"); + printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n"); } - hipSetDevice(rank); + int device = rank; #endif + hipSetDevice(device); + hipStreamCreate(©Stream); + const int len=64; + char busid[len]; + if( rank == world_rank ) { + hipDeviceGetPCIBusId(busid, len, device); + printf("local rank %d device %d bus id: %s\n", rank, device, busid); + } if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); } #endif From 6ceb55668420e843b984ed244953cdbc5612dfea Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:45:12 -0500 Subject: [PATCH 09/30] Intranode asynch hipMemCopy --- Grid/threads/Accelerator.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index cec0600f..0dfe8c64 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); } inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }; + inline int acceleratorIsCommunicable(void *ptr) { // int uvm=0; @@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline +extern hipStream_t copyStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT @@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} -inline void acceleratorCopySynchronise(void) { } +//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +//inline void acceleratorCopySynchronise(void) { } inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +{ + hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream); +} +inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; + #endif ////////////////////////////////////////////// @@ -485,18 +493,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);}; /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// -accelerator_inline void acceleratorSynchronise(void) +accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs { #ifdef GRID_SIMT #ifdef GRID_CUDA __syncwarp(); #endif -#ifdef GRID_SYCL - //cl::sycl::detail::workGroupBarrier(); -#endif -#ifdef GRID_HIP - __syncthreads(); -#endif #endif return; } From 8079dc2a145226b455608e87646992e737638db3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:45:44 -0500 Subject: [PATCH 10/30] Cray MPI not working right yet --- systems/Spock/comms.slurm | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 systems/Spock/comms.slurm diff --git a/systems/Spock/comms.slurm b/systems/Spock/comms.slurm new file mode 100644 index 00000000..0841fda0 --- /dev/null +++ b/systems/Spock/comms.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J comms +#SBATCH -o comms.%J +#SBATCH -e comms.%J +#SBATCH -N 1 +#SBATCH -n 2 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_SMP_SINGLE_COPY_MODE=NONE +export OMP_NUM_THREADS=8 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 " +srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS + From 2a4e739513dc41695b443a0464a6fe36a6aee231 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:46:09 -0500 Subject: [PATCH 11/30] Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink) --- systems/Spock/config-command | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 systems/Spock/config-command diff --git a/systems/Spock/config-command b/systems/Spock/config-command new file mode 100644 index 00000000..70c97c37 --- /dev/null +++ b/systems/Spock/config-command @@ -0,0 +1,12 @@ +../../configure --enable-comms=mpi-auto \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-accelerator=hip \ +--enable-gen-simd-width=64 \ +--enable-simd=GPU \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=hipcc MPICXX=mpicxx \ +CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ +--prefix=/ccs/home/chulwoo/Grid \ + LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " From 14d82777e0da2c1dc8e6157f3f4344cab24461b5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:47:16 -0500 Subject: [PATCH 12/30] Best modules for spock --- systems/Spock/sourceme.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 systems/Spock/sourceme.sh diff --git a/systems/Spock/sourceme.sh b/systems/Spock/sourceme.sh new file mode 100644 index 00000000..40d864b5 --- /dev/null +++ b/systems/Spock/sourceme.sh @@ -0,0 +1,5 @@ +module load PrgEnv-gnu +module load rocm/4.3.0 +module load gmp +module load cray-fftw +module load craype-accel-amd-gfx908 From 6d5277f2d78633913a67612f68b887e664bc7b6a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 20:58:02 -0500 Subject: [PATCH 13/30] Update to Spock --- systems/Spock/dwf.slurm | 26 ++++++++++++++++++++++++++ systems/Spock/dwf4.slurm | 26 ++++++++++++++++++++++++++ systems/Spock/mpiwrapper.sh | 12 ++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 systems/Spock/dwf.slurm create mode 100644 systems/Spock/dwf4.slurm create mode 100755 systems/Spock/mpiwrapper.sh diff --git a/systems/Spock/dwf.slurm b/systems/Spock/dwf.slurm new file mode 100644 index 00000000..7144a270 --- /dev/null +++ b/systems/Spock/dwf.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 1 +#SBATCH -n 1 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=NONE +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=8 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap" +srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Spock/dwf4.slurm b/systems/Spock/dwf4.slurm new file mode 100644 index 00000000..02a7b888 --- /dev/null +++ b/systems/Spock/dwf4.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 1 +#SBATCH -n 4 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=NONE +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=8 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" +srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Spock/mpiwrapper.sh b/systems/Spock/mpiwrapper.sh new file mode 100755 index 00000000..76c4e364 --- /dev/null +++ b/systems/Spock/mpiwrapper.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +lrank=$SLURM_LOCALID + +export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID + +echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING" + +$* + + + From e32d5141b4f6d74105d33349173c93e4e382911f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 21:46:31 -0500 Subject: [PATCH 14/30] Updated to make MPI reliable still gives good perf, but MPI will be slow intranode --- systems/Spock/dwf4.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systems/Spock/dwf4.slurm b/systems/Spock/dwf4.slurm index 02a7b888..261929ab 100644 --- a/systems/Spock/dwf4.slurm +++ b/systems/Spock/dwf4.slurm @@ -15,8 +15,8 @@ module list export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -#export MPICH_SMP_SINGLE_COPY_MODE=NONE -export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=CMA export OMP_NUM_THREADS=8 AT=8 From f34d34bd17307b37e89de51743fe26482bd161af Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 22 Nov 2021 22:27:16 -0500 Subject: [PATCH 15/30] 2 nodes --- systems/Spock/dwf8.slurm | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 systems/Spock/dwf8.slurm diff --git a/systems/Spock/dwf8.slurm b/systems/Spock/dwf8.slurm new file mode 100644 index 00000000..c4672db0 --- /dev/null +++ b/systems/Spock/dwf8.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 2 +#SBATCH -n 8 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=8 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" +srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + From 2bf3b4d5765b9db6d73fc4b3795971f851cb7d0f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 7 Dec 2021 09:02:02 -0800 Subject: [PATCH 16/30] Update to reduce memory footpring in benchmark test --- Grid/communicator/Communicator_mpi3.cc | 4 +- Grid/threads/Accelerator.h | 2 +- benchmarks/Benchmark_dwf_fp32.cc | 61 ++++++++++++++++++-------- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 305a3a9b..162180bc 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -388,6 +388,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); + std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); } @@ -399,12 +400,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { + acceleratorCopySynchronise(); std::cout << "Copy Synchronised\n"< status(nreq); - acceleratorCopySynchronise(); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); assert(ierr==0); list.resize(0); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index cec0600f..8be712ba 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -306,7 +306,7 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index d48486c0..4edf7c16 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -126,19 +126,10 @@ int main (int argc, char ** argv) // Naive wilson implementation //////////////////////////////////// // replicate across fifth dimension - LatticeGaugeFieldF Umu5d(FGrid); - std::vector U(4,FGrid); - { - autoView( Umu5d_v, Umu5d, CpuWrite); - autoView( Umu_v , Umu , CpuRead); - for(int ss=0;ssoSites();ss++){ - for(int s=0;s U(4,UGrid); for(int mu=0;mu(Umu5d,mu); + U[mu] = PeekIndex(Umu,mu); } std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; @@ -147,10 +138,28 @@ int main (int argc, char ** argv) ref = Zero(); for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s Date: Tue, 7 Dec 2021 16:24:24 -0500 Subject: [PATCH 17/30] Less verbose --- Grid/communicator/Communicator_mpi3.cc | 5 +++-- benchmarks/Benchmark_dwf_fp32.cc | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 162180bc..7b3e8847 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -388,7 +388,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); - std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; + // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl; acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); } @@ -400,7 +400,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { - acceleratorCopySynchronise(); std::cout << "Copy Synchronised\n"<Barrier(); From b4f8e87982b1db0f964538eb69bd6cf9b6388125 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 23:08:09 +0100 Subject: [PATCH 18/30] Have Grid's cli interface understand floats --- Grid/util/Init.cc | 7 +++++++ Grid/util/Init.h | 1 + 2 files changed, 8 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 697f3ac1..6992129e 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val) return; } +void GridCmdOptionFloat(std::string &str,float & val) +{ + std::stringstream ss(str); + ss>>val; + return; +} + void GridParseLayout(char **argv,int argc, Coordinate &latt_c, diff --git a/Grid/util/Init.h b/Grid/util/Init.h index 4eb8f06c..585660a1 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector & vec); template void GridCmdOptionIntVector(const std::string &str,VectorInt & vec); void GridCmdOptionInt(std::string &str,int & val); +void GridCmdOptionFloat(std::string &str,float & val); void GridParseLayout(char **argv,int argc, From e83423fee65743deb55023986e9765fbda431e7f Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 20:50:09 +0100 Subject: [PATCH 19/30] Refactor clover to align with other files and prepare for upcoming changes --- Grid/qcd/action/fermion/WilsonCloverFermion.h | 301 ++---------------- Grid/qcd/action/fermion/WilsonCloverHelpers.h | 191 +++++++++++ Grid/qcd/action/fermion/WilsonCloverTypes.h | 49 +++ .../WilsonCloverFermionImplementation.h | 163 +++++++++- 4 files changed, 410 insertions(+), 294 deletions(-) create mode 100644 Grid/qcd/action/fermion/WilsonCloverHelpers.h create mode 100644 Grid/qcd/action/fermion/WilsonCloverTypes.h diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 92af7111..cd19fc10 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -4,10 +4,11 @@ Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h - Copyright (C) 2017 + Copyright (C) 2017 - 2022 Author: Guido Cossu Author: David Preti <> + Author: Daniel Richtmann This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,7 +30,8 @@ #pragma once -#include +#include +#include NAMESPACE_BEGIN(Grid); @@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////// template -class WilsonCloverFermion : public WilsonFermion +class WilsonCloverFermion : public WilsonFermion, + public WilsonCloverHelpers { public: - // Types definitions INHERIT_IMPL_TYPES(Impl); - template - using iImplClover = iScalar, Ns>>; - typedef iImplClover SiteCloverType; - typedef Lattice CloverFieldType; + INHERIT_CLOVER_TYPES(Impl); -public: - typedef WilsonFermion WilsonBase; + typedef WilsonFermion WilsonBase; + typedef WilsonCloverHelpers Helpers; virtual int ConstEE(void) { return 0; }; virtual void Instantiatable(void){}; @@ -72,42 +71,7 @@ public: const RealD _csw_r = 0.0, const RealD _csw_t = 0.0, const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(), - const ImplParams &impl_p = ImplParams()) : WilsonFermion(_Umu, - Fgrid, - Hgrid, - _mass, impl_p, clover_anisotropy), - CloverTerm(&Fgrid), - CloverTermInv(&Fgrid), - CloverTermEven(&Hgrid), - CloverTermOdd(&Hgrid), - CloverTermInvEven(&Hgrid), - CloverTermInvOdd(&Hgrid), - CloverTermDagEven(&Hgrid), - CloverTermDagOdd(&Hgrid), - CloverTermInvDagEven(&Hgrid), - CloverTermInvDagOdd(&Hgrid) - { - assert(Nd == 4); // require 4 dimensions - - if (clover_anisotropy.isAnisotropic) - { - csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0; - diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0); - } - else - { - csw_r = _csw_r * 0.5; - diag_mass = 4.0 + _mass; - } - csw_t = _csw_t * 0.5; - - if (csw_r == 0) - std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl; - if (csw_t == 0) - std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl; - - ImportGauge(_Umu); - } + const ImplParams &impl_p = ImplParams()); virtual void M(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out); @@ -124,250 +88,21 @@ public: void ImportGauge(const GaugeField &_Umu); // Derivative parts unpreconditioned pseudofermions - void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag) - { - conformable(X.Grid(), Y.Grid()); - conformable(X.Grid(), force.Grid()); - GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); - GaugeField clover_force(force.Grid()); - PropagatorField Lambda(force.Grid()); + void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag); - // Guido: Here we are hitting some performance issues: - // need to extract the components of the DoubledGaugeField - // for each call - // Possible solution - // Create a vector object to store them? (cons: wasting space) - std::vector U(Nd, this->Umu.Grid()); - - Impl::extractLinkField(U, this->Umu); - - force = Zero(); - // Derivative of the Wilson hopping term - this->DhopDeriv(force, X, Y, dag); - - /////////////////////////////////////////////////////////// - // Clover term derivative - /////////////////////////////////////////////////////////// - Impl::outerProductImpl(Lambda, X, Y); - //std::cout << "Lambda:" << Lambda << std::endl; - - Gamma::Algebra sigma[] = { - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::MinusSigmaZT}; - - /* - sigma_{\mu \nu}= - | 0 sigma[0] sigma[1] sigma[2] | - | sigma[3] 0 sigma[4] sigma[5] | - | sigma[6] sigma[7] 0 sigma[8] | - | sigma[9] sigma[10] sigma[11] 0 | - */ - - int count = 0; - clover_force = Zero(); - for (int mu = 0; mu < 4; mu++) - { - force_mu = Zero(); - for (int nu = 0; nu < 4; nu++) - { - if (mu == nu) - continue; - - RealD factor; - if (nu == 4 || mu == 4) - { - factor = 2.0 * csw_t; - } - else - { - factor = 2.0 * csw_r; - } - PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked - Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok - force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked - count++; - } - - pokeLorentz(clover_force, U[mu] * force_mu, mu); - } - //clover_force *= csw; - force += clover_force; - } - - // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis - GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) - { - conformable(lambda.Grid(), U[0].Grid()); - GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid()); - // insertion in upper staple - // please check redundancy of shift operations - - // C1+ - tmp = lambda * U[nu]; - out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); - - // C2+ - tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu); - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); - - // C3+ - tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu); - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu); - - // C4+ - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda; - - // insertion in lower staple - // C1- - out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); - - // C2- - tmp = adj(lambda) * U[nu]; - out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); - - // C3- - tmp = lambda * U[nu]; - out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu); - - // C4- - out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda; - - return out; - } - -protected: +public: // here fixing the 4 dimensions, make it more general? RealD csw_r; // Clover coefficient - spatial RealD csw_t; // Clover coefficient - temporal RealD diag_mass; // Mass term - CloverFieldType CloverTerm, CloverTermInv; // Clover term - CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO - CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO - CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO - CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO - - public: - // eventually these can be compressed into 6x6 blocks instead of the 12x12 - // using the DeGrand-Rossi basis for the gamma matrices - CloverFieldType fillCloverYZ(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - autoView(T_v,T,AcceleratorWrite); - autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverXZ(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView(T_v, T,AcceleratorWrite); - autoView(F_v, F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = -F_v[i]()(); - T_v[i]()(1, 0) = F_v[i]()(); - T_v[i]()(2, 3) = -F_v[i]()(); - T_v[i]()(3, 2) = F_v[i]()(); - }); - - return T; - } - - CloverFieldType fillCloverXY(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView(T_v,T,AcceleratorWrite); - autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 1) = timesI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverXT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView( T_v , T, AcceleratorWrite); - autoView( F_v , F, AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = timesI(F_v[i]()()); - T_v[i]()(1, 0) = timesI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverYT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView( T_v ,T,AcceleratorWrite); - autoView( F_v ,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = -(F_v[i]()()); - T_v[i]()(1, 0) = (F_v[i]()()); - T_v[i]()(2, 3) = (F_v[i]()()); - T_v[i]()(3, 2) = -(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverZT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - - T = Zero(); - - autoView( T_v , T,AcceleratorWrite); - autoView( F_v , F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 0) = timesI(F_v[i]()()); - T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); - }); - - return T; - } + CloverField CloverTerm, CloverTermInv; // Clover term + CloverField CloverTermEven, CloverTermOdd; // Clover term EO + CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO + CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO + CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO }; + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h new file mode 100644 index 00000000..80f9032c --- /dev/null +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -0,0 +1,191 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h + + Copyright (C) 2021 - 2022 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#pragma once + +// Helper routines that implement common clover functionality + +NAMESPACE_BEGIN(Grid); + +template class WilsonCloverHelpers { +public: + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + + // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis + static GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) + { + conformable(lambda.Grid(), U[0].Grid()); + GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid()); + // insertion in upper staple + // please check redundancy of shift operations + + // C1+ + tmp = lambda * U[nu]; + out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); + + // C2+ + tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu); + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); + + // C3+ + tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu); + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu); + + // C4+ + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda; + + // insertion in lower staple + // C1- + out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); + + // C2- + tmp = adj(lambda) * U[nu]; + out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); + + // C3- + tmp = lambda * U[nu]; + out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu); + + // C4- + out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda; + + return out; + } + + static CloverField fillCloverYZ(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); + T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); + T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); + T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); + }); + + return T; + } + + static CloverField fillCloverXZ(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView(T_v, T,AcceleratorWrite); + autoView(F_v, F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 1) = -F_v[i]()(); + T_v[i]()(1, 0) = F_v[i]()(); + T_v[i]()(2, 3) = -F_v[i]()(); + T_v[i]()(3, 2) = F_v[i]()(); + }); + + return T; + } + + static CloverField fillCloverXY(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); + T_v[i]()(1, 1) = timesI(F_v[i]()()); + T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); + T_v[i]()(3, 3) = timesI(F_v[i]()()); + }); + + return T; + } + + static CloverField fillCloverXT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView( T_v , T, AcceleratorWrite); + autoView( F_v , F, AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 1) = timesI(F_v[i]()()); + T_v[i]()(1, 0) = timesI(F_v[i]()()); + T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); + T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); + }); + + return T; + } + + static CloverField fillCloverYT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView( T_v ,T,AcceleratorWrite); + autoView( F_v ,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 1) = -(F_v[i]()()); + T_v[i]()(1, 0) = (F_v[i]()()); + T_v[i]()(2, 3) = (F_v[i]()()); + T_v[i]()(3, 2) = -(F_v[i]()()); + }); + + return T; + } + + static CloverField fillCloverZT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + + T = Zero(); + + autoView( T_v , T,AcceleratorWrite); + autoView( F_v , F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),1, + { + T_v[i]()(0, 0) = timesI(F_v[i]()()); + T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); + T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); + T_v[i]()(3, 3) = timesI(F_v[i]()()); + }); + + return T; + } +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverTypes.h b/Grid/qcd/action/fermion/WilsonCloverTypes.h new file mode 100644 index 00000000..4428259f --- /dev/null +++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h @@ -0,0 +1,49 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h + + Copyright (C) 2021 - 2022 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +class WilsonCloverTypes { +public: + INHERIT_IMPL_TYPES(Impl); + + template using iImplClover = iScalar, Ns>>; + + typedef iImplClover SiteClover; + + typedef Lattice CloverField; +}; + +#define INHERIT_CLOVER_TYPES(Impl) \ + typedef typename WilsonCloverTypes::SiteClover SiteClover; \ + typedef typename WilsonCloverTypes::CloverField CloverField; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 3032a80c..fd81af11 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -2,12 +2,13 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc + Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h - Copyright (C) 2017 + Copyright (C) 2017 - 2022 Author: paboyle Author: Guido Cossu + Author: Daniel Richtmann This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,6 +34,45 @@ NAMESPACE_BEGIN(Grid); +template +WilsonCloverFermion::WilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r, + const RealD _csw_t, + const WilsonAnisotropyCoefficients& clover_anisotropy, + const ImplParams& impl_p) + : WilsonFermion(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) + , CloverTerm(&Fgrid) + , CloverTermInv(&Fgrid) + , CloverTermEven(&Hgrid) + , CloverTermOdd(&Hgrid) + , CloverTermInvEven(&Hgrid) + , CloverTermInvOdd(&Hgrid) + , CloverTermDagEven(&Hgrid) + , CloverTermDagOdd(&Hgrid) + , CloverTermInvDagEven(&Hgrid) + , CloverTermInvDagOdd(&Hgrid) { + assert(Nd == 4); // require 4 dimensions + + if(clover_anisotropy.isAnisotropic) { + csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0; + diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0); + } else { + csw_r = _csw_r * 0.5; + diag_mass = 4.0 + _mass; + } + csw_t = _csw_t * 0.5; + + if(csw_r == 0) + std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl; + if(csw_t == 0) + std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl; + + ImportGauge(_Umu); +} + // *NOT* EO template void WilsonCloverFermion::M(const FermionField &in, FermionField &out) @@ -67,10 +107,13 @@ void WilsonCloverFermion::Mdag(const FermionField &in, FermionField &out) template void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) { + double t0 = usecond(); WilsonFermion::ImportGauge(_Umu); + double t1 = usecond(); GridBase *grid = _Umu.Grid(); typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + double t2 = usecond(); // Compute the field strength terms mu>nu WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); @@ -79,19 +122,22 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + double t3 = usecond(); // Compute the Clover Operator acting on Colour and Spin // multiply here by the clover coefficients for the anisotropy - CloverTerm = fillCloverYZ(Bx) * csw_r; - CloverTerm += fillCloverXZ(By) * csw_r; - CloverTerm += fillCloverXY(Bz) * csw_r; - CloverTerm += fillCloverXT(Ex) * csw_t; - CloverTerm += fillCloverYT(Ey) * csw_t; - CloverTerm += fillCloverZT(Ez) * csw_t; + CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r; + CloverTerm += Helpers::fillCloverXZ(By) * csw_r; + CloverTerm += Helpers::fillCloverXY(Bz) * csw_r; + CloverTerm += Helpers::fillCloverXT(Ex) * csw_t; + CloverTerm += Helpers::fillCloverYT(Ey) * csw_t; + CloverTerm += Helpers::fillCloverZT(Ez) * csw_t; CloverTerm += diag_mass; + double t4 = usecond(); int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; + double t5 = usecond(); { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); @@ -100,7 +146,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) grid->LocalIndexToLocalCoor(site, lcoor); Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); + typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); //if (csw!=0){ for (int j = 0; j < Ns; j++) @@ -125,6 +171,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) }); } + double t6 = usecond(); // Separate the even and odd parts pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); @@ -137,6 +184,20 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); + double t7 = usecond(); + +#if 0 + std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:" + << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 + << ", allocations = " << (t2 - t1) / 1e6 + << ", field strength = " << (t3 - t2) / 1e6 + << ", fill clover = " << (t4 - t3) / 1e6 + << ", misc = " << (t5 - t4) / 1e6 + << ", inversions = " << (t6 - t5) / 1e6 + << ", pick cbs = " << (t7 - t6) / 1e6 + << ", total = " << (t7 - t0) / 1e6 + << std::endl; +#endif } template @@ -167,7 +228,7 @@ template void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv) { out.Checkerboard() = in.Checkerboard(); - CloverFieldType *Clover; + CloverField *Clover; assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); if (dag) @@ -214,9 +275,89 @@ void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionFie out = *Clover * in; } } - } // MooeeInternal +// Derivative parts unpreconditioned pseudofermions +template +void WilsonCloverFermion::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag) +{ + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; +} // Derivative parts template From 0b6fd20c5472f25c272a51a8e023b39cb85d0faf Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 21:19:50 +0100 Subject: [PATCH 20/30] Enable memory coalescing in clover term generation --- Grid/qcd/action/fermion/WilsonCloverHelpers.h | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h index 80f9032c..daf57f78 100644 --- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -85,12 +85,12 @@ public: T = Zero(); autoView(T_v,T,AcceleratorWrite); autoView(F_v,F,AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); + coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()()))); }); return T; @@ -103,12 +103,12 @@ public: autoView(T_v, T,AcceleratorWrite); autoView(F_v, F,AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 1) = -F_v[i]()(); - T_v[i]()(1, 0) = F_v[i]()(); - T_v[i]()(2, 3) = -F_v[i]()(); - T_v[i]()(3, 2) = F_v[i]()(); + coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()())); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()())); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()())); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()())); }); return T; @@ -121,12 +121,12 @@ public: autoView(T_v,T,AcceleratorWrite); autoView(F_v,F,AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 1) = timesI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); + coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()()))); }); return T; @@ -139,12 +139,12 @@ public: autoView( T_v , T, AcceleratorWrite); autoView( F_v , F, AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 1) = timesI(F_v[i]()()); - T_v[i]()(1, 0) = timesI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); + coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()()))); }); return T; @@ -157,12 +157,12 @@ public: autoView( T_v ,T,AcceleratorWrite); autoView( F_v ,F,AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 1) = -(F_v[i]()()); - T_v[i]()(1, 0) = (F_v[i]()()); - T_v[i]()(2, 3) = (F_v[i]()()); - T_v[i]()(3, 2) = -(F_v[i]()()); + coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()()))); }); return T; @@ -176,12 +176,12 @@ public: autoView( T_v , T,AcceleratorWrite); autoView( F_v , F,AcceleratorRead); - accelerator_for(i, T.Grid()->oSites(),1, + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), { - T_v[i]()(0, 0) = timesI(F_v[i]()()); - T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); + coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()()))); }); return T; From add86cd7f4070958ae993bd8dff0c6d91c9744e8 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 21:22:45 +0100 Subject: [PATCH 21/30] Abandon ET for clover application, use construct similar to multLink --- Grid/qcd/action/fermion/WilsonCloverHelpers.h | 20 +++++++++++++++++++ .../WilsonCloverFermionImplementation.h | 8 ++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h index daf57f78..337af647 100644 --- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -186,6 +186,26 @@ public: return T; } + + template + static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) { + auto CC = coalescedRead(C); + mult(&phi, &CC, &chi); + } + + template + inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) { + const int Nsimd = SiteSpinor::Nsimd(); + autoView(out_v, out, AcceleratorWrite); + autoView(phi_v, phi, AcceleratorRead); + autoView(C_v, C, AcceleratorRead); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; + accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + calcSpinor tmp; + multClover(tmp,C_v[sss],phi_v(sss)); + coalescedWrite(out_v[sss],tmp); + }); + } }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index fd81af11..e4d2e736 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -243,12 +243,12 @@ void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionFie { Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven; } - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); } else { Clover = (inv) ? &CloverTermInv : &CloverTerm; - out = adj(*Clover) * in; + Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway } } else @@ -266,13 +266,13 @@ void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionFie // std::cout << "Calling clover term Even" << std::endl; Clover = (inv) ? &CloverTermInvEven : &CloverTermEven; } - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); // std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl; } else { Clover = (inv) ? &CloverTermInv : &CloverTerm; - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); } } } // MooeeInternal From 3082ab82524dfc3ff0d83e0df34e3d6f930d3f1b Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 21:39:52 +0100 Subject: [PATCH 22/30] Check in compact version of wilson clover fermions --- .../fermion/CompactWilsonCloverFermion.h | 494 ++++++++++++++++ Grid/qcd/action/fermion/WilsonCloverHelpers.h | 550 ++++++++++++++++++ Grid/qcd/action/fermion/WilsonCloverTypes.h | 43 ++ .../Test_compact_wilson_clover_speedup.cc | 226 +++++++ 4 files changed, 1313 insertions(+) create mode 100644 Grid/qcd/action/fermion/CompactWilsonCloverFermion.h create mode 100644 tests/core/Test_compact_wilson_clover_speedup.cc diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h new file mode 100644 index 00000000..550bef85 --- /dev/null +++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h @@ -0,0 +1,494 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h + + Copyright (C) 2020 - 2022 + + Author: Daniel Richtmann + Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#pragma once + +#include +#include + +NAMESPACE_BEGIN(Grid); + +// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description +// +// Modifications done here: +// +// Original: clover term = 12x12 matrix per site +// +// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running) +// Sufficient to store/transfer only the real parts of the diagonal and one triangular part +// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered +// +// Here: Above but diagonal as complex numbers, i.e., need to store/transfer +// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words +// +// Words per site and improvement compared to original (combined with the input and output spinors): +// +// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less +// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less +// - Here: 2*12 + 42 = 66 words -> 2.55 x less +// +// These improvements directly translate to wall-clock time +// +// Data layout: +// +// - diagonal and triangle part as separate lattice fields, +// this was faster than as 1 combined field on all tested machines +// - diagonal: as expected +// - triangle: store upper right triangle in row major order +// - graphical: +// 0 1 2 3 4 +// 5 6 7 8 +// 9 10 11 = upper right triangle indices +// 12 13 +// 14 +// 0 +// 1 +// 2 +// 3 = diagonal indices +// 4 +// 5 +// 0 +// 1 5 +// 2 6 9 = lower left triangle indices +// 3 7 10 12 +// 4 8 11 13 14 +// +// Impact on total memory consumption: +// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site +// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site +// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site +// = 84 complex words per site + +template +class CompactWilsonCloverFermion : public WilsonFermion, + public WilsonCloverHelpers, + public CompactWilsonCloverHelpers { + ///////////////////////////////////////////// + // Sizes + ///////////////////////////////////////////// + +public: + + INHERIT_COMPACT_CLOVER_SIZES(Impl); + + ///////////////////////////////////////////// + // Type definitions + ///////////////////////////////////////////// + +public: + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + INHERIT_COMPACT_CLOVER_TYPES(Impl); + + typedef WilsonFermion WilsonBase; + typedef WilsonCloverHelpers Helpers; + typedef CompactWilsonCloverHelpers CompactHelpers; + + ///////////////////////////////////////////// + // Constructors + ///////////////////////////////////////////// + +public: + + CompactWilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r = 0.0, + const RealD _csw_t = 0.0, + const RealD _cF = 1.0, + const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(), + const ImplParams& impl_p = ImplParams()) + : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) + , csw_r(_csw_r) + , csw_t(_csw_t) + , cF(_cF) + , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) + , Diagonal(&Fgrid), Triangle(&Fgrid) + , DiagonalEven(&Hgrid), TriangleEven(&Hgrid) + , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid) + , DiagonalInv(&Fgrid), TriangleInv(&Fgrid) + , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid) + , DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid) + , Tmp(&Fgrid) + , BoundaryMask(&Fgrid) + , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid) + { + csw_r *= 0.5; + csw_t *= 0.5; + if (clover_anisotropy.isAnisotropic) + csw_r /= clover_anisotropy.xi_0; + + ImportGauge(_Umu); + if (open_boundaries) + CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); + } + + ///////////////////////////////////////////// + // Member functions (implementing interface) + ///////////////////////////////////////////// + +public: + + virtual void Instantiatable() {}; + int ConstEE() override { return 0; }; + int isTrivialEE() override { return 0; }; + + + void Dhop(const FermionField& in, FermionField& out, int dag) override { + WilsonBase::Dhop(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void DhopOE(const FermionField& in, FermionField& out, int dag) override { + WilsonBase::DhopOE(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void DhopEO(const FermionField& in, FermionField& out, int dag) override { + WilsonBase::DhopEO(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override { + WilsonBase::DhopDir(in, out, dir, disp); + if(this->open_boundaries) ApplyBoundaryMask(out); + } + + void DhopDirAll(const FermionField& in, std::vector& out) /* override */ { + WilsonBase::DhopDirAll(in, out); + if(this->open_boundaries) { + for(auto& o : out) ApplyBoundaryMask(o); + } + } + + void M(const FermionField& in, FermionField& out) override { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc + Mooee(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void Mdag(const FermionField& in, FermionField& out) override { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc + MooeeDag(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void Meooe(const FermionField& in, FermionField& out) override { + WilsonBase::Meooe(in, out); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void MeooeDag(const FermionField& in, FermionField& out) override { + WilsonBase::MeooeDag(in, out); + if(open_boundaries) ApplyBoundaryMask(out); + } + + void Mooee(const FermionField& in, FermionField& out) override { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalOdd, TriangleOdd); + } else { + MooeeInternal(in, out, DiagonalEven, TriangleEven); + } + } else { + MooeeInternal(in, out, Diagonal, Triangle); + } + if(open_boundaries) ApplyBoundaryMask(out); + } + + void MooeeDag(const FermionField& in, FermionField& out) override { + Mooee(in, out); // blocks are hermitian + } + + void MooeeInv(const FermionField& in, FermionField& out) override { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd); + } else { + MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven); + } + } else { + MooeeInternal(in, out, DiagonalInv, TriangleInv); + } + if(open_boundaries) ApplyBoundaryMask(out); + } + + void MooeeInvDag(const FermionField& in, FermionField& out) override { + MooeeInv(in, out); // blocks are hermitian + } + + void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override { + DhopDir(in, out, dir, disp); + } + + void MdirAll(const FermionField& in, std::vector& out) override { + DhopDirAll(in, out); + } + + void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override { + assert(!open_boundaries); // TODO check for changes required for open bc + + // NOTE: code copied from original clover term + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; + } + + void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override { + assert(0); + } + + void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override { + assert(0); + } + + ///////////////////////////////////////////// + // Member functions (internals) + ///////////////////////////////////////////// + + void MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + out.Checkerboard() = in.Checkerboard(); + conformable(in, out); + conformable(in, diagonal); + conformable(in, triangle); + + CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle); + } + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + + void ImportGauge(const GaugeField& _Umu) override { + // NOTE: parts copied from original implementation + + // Import gauge into base class + double t0 = usecond(); + WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that + + // Initialize temporary variables + double t1 = usecond(); + conformable(_Umu.Grid(), this->GaugeGrid()); + GridBase* grid = _Umu.Grid(); + typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + CloverField TmpOriginal(grid); + + // Compute the field strength terms mu>nu + double t2 = usecond(); + WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); + WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); + WilsonLoops::FieldStrength(Bz, _Umu, Ydir, Xdir); + WilsonLoops::FieldStrength(Ex, _Umu, Tdir, Xdir); + WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); + WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + + // Compute the Clover Operator acting on Colour and Spin + // multiply here by the clover coefficients for the anisotropy + double t3 = usecond(); + TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r; + TmpOriginal += Helpers::fillCloverXZ(By) * csw_r; + TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r; + TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; + TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; + TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; + TmpOriginal += this->diag_mass; + + // Convert the data layout of the clover term + double t4 = usecond(); + CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); + + // Possible modify the boundary values + double t5 = usecond(); + if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); + + // Invert the clover term in the improved layout + double t6 = usecond(); + CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); + + // Fill the remaining clover fields + double t7 = usecond(); + pickCheckerboard(Even, DiagonalEven, Diagonal); + pickCheckerboard(Even, TriangleEven, Triangle); + pickCheckerboard(Odd, DiagonalOdd, Diagonal); + pickCheckerboard(Odd, TriangleOdd, Triangle); + pickCheckerboard(Even, DiagonalInvEven, DiagonalInv); + pickCheckerboard(Even, TriangleInvEven, TriangleInv); + pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv); + pickCheckerboard(Odd, TriangleInvOdd, TriangleInv); + + // Report timings + double t8 = usecond(); +#if 0 + std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:" + << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 + << ", allocations = " << (t2 - t1) / 1e6 + << ", field strength = " << (t3 - t2) / 1e6 + << ", fill clover = " << (t4 - t3) / 1e6 + << ", convert = " << (t5 - t4) / 1e6 + << ", boundaries = " << (t6 - t5) / 1e6 + << ", inversions = " << (t7 - t6) / 1e6 + << ", pick cbs = " << (t8 - t7) / 1e6 + << ", total = " << (t8 - t0) / 1e6 + << std::endl; +#endif + } + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + +private: + + template + const MaskField* getCorrectMaskField(const Field &in) const { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + return &this->BoundaryMaskOdd; + } else { + return &this->BoundaryMaskEven; + } + } else { + return &this->BoundaryMask; + } + } + + template + void ApplyBoundaryMask(Field& f) { + const MaskField* m = getCorrectMaskField(f); assert(m != nullptr); + assert(m != nullptr); + CompactHelpers::ApplyBoundaryMask(f, *m); + } + + ///////////////////////////////////////////// + // Member Data + ///////////////////////////////////////////// + +public: + + RealD csw_r; + RealD csw_t; + RealD cF; + + bool open_boundaries; + + CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd; + CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd; + + CloverTriangleField Triangle, TriangleEven, TriangleOdd; + CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd; + + FermionField Tmp; + + MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd; +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h index 337af647..588525cc 100644 --- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -208,4 +208,554 @@ public: } }; + +template class CompactWilsonCloverHelpers { +public: + + INHERIT_COMPACT_CLOVER_SIZES(Impl); + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + INHERIT_COMPACT_CLOVER_TYPES(Impl); + + #if 0 + static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) { + assert(i != j); + if(i < j) { + return triangle()(block)(triangle_index(i, j)); + } else { // i > j + return conjugate(triangle()(block)(triangle_index(i, j))); + } + } + #else + template + static accelerator_inline vobj triangle_elem(const iImplCloverTriangle& triangle, int block, int i, int j) { + assert(i != j); + if(i < j) { + return triangle()(block)(triangle_index(i, j)); + } else { // i > j + return conjugate(triangle()(block)(triangle_index(i, j))); + } + } + #endif + + static accelerator_inline int triangle_index(int i, int j) { + if(i == j) + return 0; + else if(i < j) + return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1; + else // i > j + return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1; + } + + static void MooeeKernel_gpu(int Nsite, + int Ls, + const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + autoView(diagonal_v, diagonal, AcceleratorRead); + autoView(triangle_v, triangle, AcceleratorRead); + autoView(in_v, in, AcceleratorRead); + autoView(out_v, out, AcceleratorWrite); + + typedef decltype(coalescedRead(out_v[0])) CalcSpinor; + + const uint64_t NN = Nsite * Ls; + + accelerator_for(ss, NN, Simd::Nsimd(), { + int sF = ss; + int sU = ss/Ls; + CalcSpinor res; + CalcSpinor in_t = in_v(sF); + auto diagonal_t = diagonal_v(sU); + auto triangle_t = triangle_v(sU); + for(int block=0; block penalty -> disable */ \ + \ + if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \ + base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \ + svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \ + } \ + \ + if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \ + base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \ + svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \ + } \ + } +// TODO: Implement/generalize this for other architectures +// I played around a bit on KNL (see below) but didn't bring anything +// #elif defined(AVX512) +// #define PREFETCH_CLOVER(BASE) { \ +// uint64_t base; \ +// int pf_dist_L1 = 1; \ +// int pf_dist_L2 = +4; \ +// \ +// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \ +// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \ +// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \ +// } \ +// \ +// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \ +// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \ +// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \ +// } \ +// } +#else +#define PREFETCH_CLOVER(BASE) +#endif + + const uint64_t NN = Nsite * Ls; + + thread_for(ss, NN, { + int sF = ss; + int sU = ss/Ls; + CalcSpinor res; + CalcSpinor in_t = in_v[sF]; + auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read + auto triangle_t = triangle_v[sU]; + + // upper half + PREFETCH_CLOVER(0); + + auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number + auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from + auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20 + auto in_cc_1_0 = conjugate(in_t()(1)(0)); + auto in_cc_1_1 = conjugate(in_t()(1)(1)); + + res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0) + + triangle_t()(0)( 0) * in_t()(0)(1) + + triangle_t()(0)( 1) * in_t()(0)(2) + + triangle_t()(0)( 2) * in_t()(1)(0) + + triangle_t()(0)( 3) * in_t()(1)(1) + + triangle_t()(0)( 4) * in_t()(1)(2); + + res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0; + res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1) + + triangle_t()(0)( 5) * in_t()(0)(2) + + triangle_t()(0)( 6) * in_t()(1)(0) + + triangle_t()(0)( 7) * in_t()(1)(1) + + triangle_t()(0)( 8) * in_t()(1)(2) + + conjugate( res()(0)( 1)); + + res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0 + + triangle_t()(0)( 5) * in_cc_0_1; + res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2) + + triangle_t()(0)( 9) * in_t()(1)(0) + + triangle_t()(0)(10) * in_t()(1)(1) + + triangle_t()(0)(11) * in_t()(1)(2) + + conjugate( res()(0)( 2)); + + res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0 + + triangle_t()(0)( 6) * in_cc_0_1 + + triangle_t()(0)( 9) * in_cc_0_2; + res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0) + + triangle_t()(0)(12) * in_t()(1)(1) + + triangle_t()(0)(13) * in_t()(1)(2) + + conjugate( res()(1)( 0)); + + res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0 + + triangle_t()(0)( 7) * in_cc_0_1 + + triangle_t()(0)(10) * in_cc_0_2 + + triangle_t()(0)(12) * in_cc_1_0; + res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1) + + triangle_t()(0)(14) * in_t()(1)(2) + + conjugate( res()(1)( 1)); + + res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0 + + triangle_t()(0)( 8) * in_cc_0_1 + + triangle_t()(0)(11) * in_cc_0_2 + + triangle_t()(0)(13) * in_cc_1_0 + + triangle_t()(0)(14) * in_cc_1_1; + res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2) + + conjugate( res()(1)( 2)); + + vstream(out_v[sF]()(0)(0), res()(0)(0)); + vstream(out_v[sF]()(0)(1), res()(0)(1)); + vstream(out_v[sF]()(0)(2), res()(0)(2)); + vstream(out_v[sF]()(1)(0), res()(1)(0)); + vstream(out_v[sF]()(1)(1), res()(1)(1)); + vstream(out_v[sF]()(1)(2), res()(1)(2)); + + // lower half + PREFETCH_CLOVER(1); + + auto in_cc_2_0 = conjugate(in_t()(2)(0)); + auto in_cc_2_1 = conjugate(in_t()(2)(1)); + auto in_cc_2_2 = conjugate(in_t()(2)(2)); + auto in_cc_3_0 = conjugate(in_t()(3)(0)); + auto in_cc_3_1 = conjugate(in_t()(3)(1)); + + res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0) + + triangle_t()(1)( 0) * in_t()(2)(1) + + triangle_t()(1)( 1) * in_t()(2)(2) + + triangle_t()(1)( 2) * in_t()(3)(0) + + triangle_t()(1)( 3) * in_t()(3)(1) + + triangle_t()(1)( 4) * in_t()(3)(2); + + res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0; + res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1) + + triangle_t()(1)( 5) * in_t()(2)(2) + + triangle_t()(1)( 6) * in_t()(3)(0) + + triangle_t()(1)( 7) * in_t()(3)(1) + + triangle_t()(1)( 8) * in_t()(3)(2) + + conjugate( res()(2)( 1)); + + res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0 + + triangle_t()(1)( 5) * in_cc_2_1; + res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2) + + triangle_t()(1)( 9) * in_t()(3)(0) + + triangle_t()(1)(10) * in_t()(3)(1) + + triangle_t()(1)(11) * in_t()(3)(2) + + conjugate( res()(2)( 2)); + + res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0 + + triangle_t()(1)( 6) * in_cc_2_1 + + triangle_t()(1)( 9) * in_cc_2_2; + res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0) + + triangle_t()(1)(12) * in_t()(3)(1) + + triangle_t()(1)(13) * in_t()(3)(2) + + conjugate( res()(3)( 0)); + + res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0 + + triangle_t()(1)( 7) * in_cc_2_1 + + triangle_t()(1)(10) * in_cc_2_2 + + triangle_t()(1)(12) * in_cc_3_0; + res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1) + + triangle_t()(1)(14) * in_t()(3)(2) + + conjugate( res()(3)( 1)); + + res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0 + + triangle_t()(1)( 8) * in_cc_2_1 + + triangle_t()(1)(11) * in_cc_2_2 + + triangle_t()(1)(13) * in_cc_3_0 + + triangle_t()(1)(14) * in_cc_3_1; + res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2) + + conjugate( res()(3)( 2)); + + vstream(out_v[sF]()(2)(0), res()(2)(0)); + vstream(out_v[sF]()(2)(1), res()(2)(1)); + vstream(out_v[sF]()(2)(2), res()(2)(2)); + vstream(out_v[sF]()(3)(0), res()(3)(0)); + vstream(out_v[sF]()(3)(1), res()(3)(1)); + vstream(out_v[sF]()(3)(2), res()(3)(2)); + }); + } + + static void MooeeKernel(int Nsite, + int Ls, + const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { +#if defined(GRID_CUDA) || defined(GRID_HIP) + MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle); +#else + MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle); +#endif + } + + static void Invert(const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverDiagonalField& diagonalInv, + CloverTriangleField& triangleInv) { + conformable(diagonal, diagonalInv); + conformable(triangle, triangleInv); + conformable(diagonal, triangle); + + diagonalInv.Checkerboard() = diagonal.Checkerboard(); + triangleInv.Checkerboard() = triangle.Checkerboard(); + + GridBase* grid = diagonal.Grid(); + + long lsites = grid->lSites(); + + typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal; + typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle; + + autoView(diagonal_v, diagonal, CpuRead); + autoView(triangle_v, triangle, CpuRead); + autoView(diagonalInv_v, diagonalInv, CpuWrite); + autoView(triangleInv_v, triangleInv, CpuWrite); + + thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite + Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc); + Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc); + + scalar_object_diagonal diagonal_tmp = Zero(); + scalar_object_diagonal diagonal_inv_tmp = Zero(); + scalar_object_triangle triangle_tmp = Zero(); + scalar_object_triangle triangle_inv_tmp = Zero(); + + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + + peekLocalSite(diagonal_tmp, diagonal_v, lcoor); + peekLocalSite(triangle_tmp, triangle_v, lcoor); + + // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately? + for (long s_row=0;s_row 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for (long c_row=0;c_row(TensorRemove(diagonal_tmp()(block)(i))); + else + clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast(TensorRemove(triangle_elem(triangle_tmp, block, i, j))); + } + } + } + } + + clover_inv_eigen = clover_eigen.inverse(); + + for (long s_row=0;s_row 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for (long c_row=0;c_rowoSites(), 1, { + for(int s_row = 0; s_row < Ns; s_row++) { + for(int s_col = 0; s_col < Ns; s_col++) { + if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for(int c_row = 0; c_row < Nc; c_row++) { + for(int c_col = 0; c_col < Nc; c_col++) { + int i = s_row_block * Nc + c_row; + int j = s_col_block * Nc + c_col; + if(i == j) + diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col); + else if(i < j) + triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col); + else + continue; + } + } + } + } + }); + } + + + static void ConvertLayout(const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverField& full) { + conformable(full, diagonal); + conformable(full, triangle); + + full.Checkerboard() = diagonal.Checkerboard(); + + full = Zero(); + + autoView(diagonal_v, diagonal, AcceleratorRead); + autoView(triangle_v, triangle, AcceleratorRead); + autoView(full_v, full, AcceleratorWrite); + + // NOTE: this function cannot be 'private' since nvcc forbids this for kernels + accelerator_for(ss, full.Grid()->oSites(), 1, { + for(int s_row = 0; s_row < Ns; s_row++) { + for(int s_col = 0; s_col < Ns; s_col++) { + if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for(int c_row = 0; c_row < Nc; c_row++) { + for(int c_col = 0; c_col < Nc; c_col++) { + int i = s_row_block * Nc + c_row; + int j = s_col_block * Nc + c_col; + if(i == j) + full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i); + else + full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j); + } + } + } + } + }); + } + + static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) { + // Checks/grid + double t0 = usecond(); + conformable(diagonal, triangle); + GridBase* grid = diagonal.Grid(); + + // Determine the boundary coordinates/sites + double t1 = usecond(); + int t_dir = Nd - 1; + Lattice> t_coor(grid); + LatticeCoordinate(t_coor, t_dir); + int T = grid->GlobalDimensions()[t_dir]; + + // Set off-diagonal parts at boundary to zero -- OK + double t2 = usecond(); + CloverTriangleField zeroTriangle(grid); + zeroTriangle.Checkerboard() = triangle.Checkerboard(); + zeroTriangle = Zero(); + triangle = where(t_coor == 0, zeroTriangle, triangle); + triangle = where(t_coor == T-1, zeroTriangle, triangle); + + // Set diagonal to unity (scaled correctly) -- OK + double t3 = usecond(); + CloverDiagonalField tmp(grid); + tmp.Checkerboard() = diagonal.Checkerboard(); + tmp = -1.0 * csw_t + diag_mass; + diagonal = where(t_coor == 0, tmp, diagonal); + diagonal = where(t_coor == T-1, tmp, diagonal); + + // Correct values next to boundary + double t4 = usecond(); + if(cF != 1.0) { + tmp = cF - 1.0; + tmp += diagonal; + diagonal = where(t_coor == 1, tmp, diagonal); + diagonal = where(t_coor == T-2, tmp, diagonal); + } + + // Report timings + double t5 = usecond(); +#if 0 + std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:" + << " checks = " << (t1 - t0) / 1e6 + << ", coordinate = " << (t2 - t1) / 1e6 + << ", off-diag zero = " << (t3 - t2) / 1e6 + << ", diagonal unity = " << (t4 - t3) / 1e6 + << ", near-boundary = " << (t5 - t4) / 1e6 + << ", total = " << (t5 - t0) / 1e6 + << std::endl; +#endif + } + + template + static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) { + conformable(f, m); + auto grid = f.Grid(); + const int Nsite = grid->oSites(); + const int Nsimd = grid->Nsimd(); + autoView(f_v, f, AcceleratorWrite); + autoView(m_v, m, AcceleratorRead); + // NOTE: this function cannot be 'private' since nvcc forbids this for kernels + accelerator_for(ss, Nsite, Nsimd, { + coalescedWrite(f_v[ss], m_v(ss) * f_v(ss)); + }); + } + + template + static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) { + assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even); + assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd); + assert(!full.Grid()->_isCheckerBoarded); + + GridBase* grid = full.Grid(); + int t_dir = Nd-1; + Lattice> t_coor(grid); + LatticeCoordinate(t_coor, t_dir); + int T = grid->GlobalDimensions()[t_dir]; + + MaskField zeroMask(grid); zeroMask = Zero(); + full = 1.0; + full = where(t_coor == 0, zeroMask, full); + full = where(t_coor == T-1, zeroMask, full); + + pickCheckerboard(Even, even, full); + pickCheckerboard(Odd, odd, full); + } +}; + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverTypes.h b/Grid/qcd/action/fermion/WilsonCloverTypes.h index 4428259f..a5a95d93 100644 --- a/Grid/qcd/action/fermion/WilsonCloverTypes.h +++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h @@ -42,8 +42,51 @@ public: typedef Lattice CloverField; }; +template +class CompactWilsonCloverTypes { +public: + INHERIT_IMPL_TYPES(Impl); + + static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions"); + + static constexpr int Nred = Nc * Nhs; // 6 + static constexpr int Nblock = Nhs; // 2 + static constexpr int Ndiagonal = Nred; // 6 + static constexpr int Ntriangle = (Nred - 1) * Nc; // 15 + + template using iImplCloverDiagonal = iScalar, Nblock>>; + template using iImplCloverTriangle = iScalar, Nblock>>; + + typedef iImplCloverDiagonal SiteCloverDiagonal; + typedef iImplCloverTriangle SiteCloverTriangle; + typedef iSinglet SiteMask; + + typedef Lattice CloverDiagonalField; + typedef Lattice CloverTriangleField; + typedef Lattice MaskField; +}; + #define INHERIT_CLOVER_TYPES(Impl) \ typedef typename WilsonCloverTypes::SiteClover SiteClover; \ typedef typename WilsonCloverTypes::CloverField CloverField; +#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \ + typedef typename CompactWilsonCloverTypes::SiteCloverDiagonal SiteCloverDiagonal; \ + typedef typename CompactWilsonCloverTypes::SiteCloverTriangle SiteCloverTriangle; \ + typedef typename CompactWilsonCloverTypes::SiteMask SiteMask; \ + typedef typename CompactWilsonCloverTypes::CloverDiagonalField CloverDiagonalField; \ + typedef typename CompactWilsonCloverTypes::CloverTriangleField CloverTriangleField; \ + typedef typename CompactWilsonCloverTypes::MaskField MaskField; \ + /* ugly duplication but needed inside functionality classes */ \ + template using iImplCloverDiagonal = \ + iScalar::Ndiagonal>, CompactWilsonCloverTypes::Nblock>>; \ + template using iImplCloverTriangle = \ + iScalar::Ntriangle>, CompactWilsonCloverTypes::Nblock>>; + +#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \ + static constexpr int Nred = CompactWilsonCloverTypes::Nred; \ + static constexpr int Nblock = CompactWilsonCloverTypes::Nblock; \ + static constexpr int Ndiagonal = CompactWilsonCloverTypes::Ndiagonal; \ + static constexpr int Ntriangle = CompactWilsonCloverTypes::Ntriangle; + NAMESPACE_END(Grid); diff --git a/tests/core/Test_compact_wilson_clover_speedup.cc b/tests/core/Test_compact_wilson_clover_speedup.cc new file mode 100644 index 00000000..83e3da1f --- /dev/null +++ b/tests/core/Test_compact_wilson_clover_speedup.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_compact_wilson_clover_speedup.cc + + Copyright (C) 2020 - 2022 + + Author: Daniel Richtmann + Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +NAMESPACE_BEGIN(CommandlineHelpers); + +static bool checkPresent(int* argc, char*** argv, const std::string& option) { + return GridCmdOptionExists(*argv, *argv + *argc, option); +} + +static std::string getContent(int* argc, char*** argv, const std::string& option) { + return GridCmdOptionPayload(*argv, *argv + *argc, option); +} + +static int readInt(int* argc, char*** argv, std::string&& option, int defaultValue) { + std::string arg; + int ret = defaultValue; + if(checkPresent(argc, argv, option)) { + arg = getContent(argc, argv, option); + GridCmdOptionInt(arg, ret); + } + return ret; +} + +static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) { + std::string arg; + float ret = defaultValue; + if(checkPresent(argc, argv, option)) { + arg = getContent(argc, argv, option); + GridCmdOptionFloat(arg, ret); + } + return ret; +} + +NAMESPACE_END(CommandlineHelpers); + + +#define _grid_printf(LOGGER, ...) \ + { \ + if((LOGGER).isActive()) { /* this makes it safe to put, e.g., norm2 in the calling code w.r.t. performance */ \ + char _printf_buf[1024]; \ + std::sprintf(_printf_buf, __VA_ARGS__); \ + std::cout << (LOGGER) << _printf_buf; \ + fflush(stdout); \ + } \ + } +#define grid_printf_msg(...) _grid_printf(GridLogMessage, __VA_ARGS__) + + +template +bool resultsAgree(const Field& ref, const Field& res, const std::string& name) { + RealD checkTolerance = (getPrecision::value == 2) ? 1e-15 : 1e-7; + Field diff(ref.Grid()); + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage + << "norm2(reference), norm2(" << name << "), abs. deviation, rel. deviation: " << norm2(ref) << " " + << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + + return relDev <= checkTolerance; +} + + +template +void runBenchmark(int* argc, char*** argv) { + // read from command line + const int nIter = CommandlineHelpers::readInt( argc, argv, "--niter", 1000); + const RealD mass = CommandlineHelpers::readFloat( argc, argv, "--mass", 0.5); + const RealD csw = CommandlineHelpers::readFloat( argc, argv, "--csw", 1.0); + const RealD cF = CommandlineHelpers::readFloat( argc, argv, "--cF", 1.0); + const bool antiPeriodic = CommandlineHelpers::checkPresent(argc, argv, "--antiperiodic"); + + // precision + static_assert(getPrecision::value == 2 || getPrecision::value == 1, "Incorrect precision"); // double or single + std::string precision = (getPrecision::value == 2 ? "double" : "single"); + + // setup grids + GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vCoeff_t::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + // clang-format on + + // setup rng + std::vector seeds({1, 2, 3, 4}); + GridParallelRNG pRNG(UGrid); + pRNG.SeedFixedIntegers(seeds); + + // type definitions + typedef WilsonImpl WImpl; + typedef WilsonCloverFermion WilsonCloverOperator; + typedef CompactWilsonCloverFermion CompactWilsonCloverOperator; + typedef typename WilsonCloverOperator::FermionField Fermion; + typedef typename WilsonCloverOperator::GaugeField Gauge; + + // setup fields + Fermion src(UGrid); random(pRNG, src); + Fermion ref(UGrid); ref = Zero(); + Fermion res(UGrid); res = Zero(); + Fermion hop(UGrid); hop = Zero(); + Fermion diff(UGrid); diff = Zero(); + Gauge Umu(UGrid); SU3::HotConfiguration(pRNG, Umu); + + // setup boundary phases + typename WilsonCloverOperator::ImplParams implParams; + std::vector boundary_phases(Nd, 1.); + if(antiPeriodic) boundary_phases[Nd-1] = -1.; + implParams.boundary_phases = boundary_phases; + WilsonAnisotropyCoefficients anisParams; + + // misc stuff needed for benchmarks + double volume=1.0; for(int mu=0; mu_fdimensions[mu]; + + // setup fermion operators + WilsonCloverOperator Dwc( Umu, *UGrid, *UrbGrid, mass, csw, csw, anisParams, implParams); + CompactWilsonCloverOperator Dwc_compact(Umu, *UGrid, *UrbGrid, mass, csw, csw, cF, anisParams, implParams); + + // now test the conversions + typename CompactWilsonCloverOperator::CloverField tmp_ref(UGrid); tmp_ref = Dwc.CloverTerm; + typename CompactWilsonCloverOperator::CloverField tmp_res(UGrid); tmp_res = Zero(); + typename CompactWilsonCloverOperator::CloverField tmp_diff(UGrid); tmp_diff = Zero(); + typename CompactWilsonCloverOperator::CloverDiagonalField diagonal(UGrid); diagonal = Zero(); + typename CompactWilsonCloverOperator::CloverTriangleField triangle(UGrid); diagonal = Zero(); + CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(tmp_ref, diagonal, triangle); + CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(diagonal, triangle, tmp_res); + tmp_diff = tmp_ref - tmp_res; + std::cout << GridLogMessage << "conversion: ref, res, diff, eps" + << " " << norm2(tmp_ref) + << " " << norm2(tmp_res) + << " " << norm2(tmp_diff) + << " " << norm2(tmp_diff) / norm2(tmp_ref) + << std::endl; + + // performance per site (use minimal values necessary) + double hop_flop_per_site = 1320; // Rich's Talk + what Peter uses + double hop_byte_per_site = (8 * 9 + 9 * 12) * 2 * getPrecision::value * 4; + double clov_flop_per_site = 504; // Rich's Talk and 1412.2629 + double clov_byte_per_site = (2 * 18 + 12 + 12) * 2 * getPrecision::value * 4; + double clov_flop_per_site_performed = 1128; + double clov_byte_per_site_performed = (12 * 12 + 12 + 12) * 2 * getPrecision::value * 4; + + // total performance numbers + double hop_gflop_total = volume * nIter * hop_flop_per_site / 1e9; + double hop_gbyte_total = volume * nIter * hop_byte_per_site / 1e9; + double clov_gflop_total = volume * nIter * clov_flop_per_site / 1e9; + double clov_gbyte_total = volume * nIter * clov_byte_per_site / 1e9; + double clov_gflop_performed_total = volume * nIter * clov_flop_per_site_performed / 1e9; + double clov_gbyte_performed_total = volume * nIter * clov_byte_per_site_performed / 1e9; + + // warmup + measure dhop + for(auto n : {1, 2, 3, 4, 5}) Dwc.Dhop(src, hop, 0); + double t0 = usecond(); + for(int n = 0; n < nIter; n++) Dwc.Dhop(src, hop, 0); + double t1 = usecond(); + double secs_hop = (t1-t0)/1e6; + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", + "hop", precision.c_str(), secs_hop, hop_gflop_total/secs_hop, hop_gbyte_total/secs_hop, 0.0, secs_hop/secs_hop); + +#define BENCH_CLOVER_KERNEL(KERNEL) { \ + /* warmup + measure reference clover */ \ + for(auto n : {1, 2, 3, 4, 5}) Dwc.KERNEL(src, ref); \ + double t2 = usecond(); \ + for(int n = 0; n < nIter; n++) Dwc.KERNEL(src, ref); \ + double t3 = usecond(); \ + double secs_ref = (t3-t2)/1e6; \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \ + "reference_"#KERNEL, precision.c_str(), secs_ref, clov_gflop_total/secs_ref, clov_gbyte_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", /* to see how well the ET performs */ \ + "reference_"#KERNEL"_performed", precision.c_str(), secs_ref, clov_gflop_performed_total/secs_ref, clov_gbyte_performed_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \ +\ + /* warmup + measure compact clover */ \ + for(auto n : {1, 2, 3, 4, 5}) Dwc_compact.KERNEL(src, res); \ + double t4 = usecond(); \ + for(int n = 0; n < nIter; n++) Dwc_compact.KERNEL(src, res); \ + double t5 = usecond(); \ + double secs_res = (t5-t4)/1e6; \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \ + "compact_"#KERNEL, precision.c_str(), secs_res, clov_gflop_total/secs_res, clov_gbyte_total/secs_res, secs_ref/secs_res, secs_res/secs_hop); \ + assert(resultsAgree(ref, res, #KERNEL)); \ +} + + BENCH_CLOVER_KERNEL(Mooee); + BENCH_CLOVER_KERNEL(MooeeDag); + BENCH_CLOVER_KERNEL(MooeeInv); + BENCH_CLOVER_KERNEL(MooeeInvDag); + + grid_printf_msg("finalize %s\n", precision.c_str()); +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + runBenchmark(&argc, &argv); + runBenchmark(&argc, &argv); + + Grid_finalize(); +} From 1b6b12589f7634ce1eecda5b3b5ce0a20897dfce Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 1 Feb 2022 22:41:01 +0100 Subject: [PATCH 23/30] Get splitting up into implementation and instantiation files correct --- .../fermion/CompactWilsonCloverFermion.h | 296 +------------- Grid/qcd/action/fermion/Fermion.h | 18 + ...CompactWilsonCloverFermionImplementation.h | 363 ++++++++++++++++++ ...WilsonCloverFermionInstantiation.cc.master | 41 ++ ...rFermionInstantiationGparityWilsonImplD.cc | 1 + ...rFermionInstantiationGparityWilsonImplF.cc | 1 + ...loverFermionInstantiationWilsonAdjImplD.cc | 1 + ...loverFermionInstantiationWilsonAdjImplF.cc | 1 + ...onCloverFermionInstantiationWilsonImplD.cc | 1 + ...onCloverFermionInstantiationWilsonImplF.cc | 1 + ...tiationWilsonTwoIndexAntiSymmetricImplD.cc | 1 + ...tiationWilsonTwoIndexAntiSymmetricImplF.cc | 1 + ...stantiationWilsonTwoIndexSymmetricImplD.cc | 1 + ...stantiationWilsonTwoIndexSymmetricImplF.cc | 1 + .../instantiation/generate_instantiations.sh | 2 +- 15 files changed, 454 insertions(+), 276 deletions(-) create mode 100644 Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h create mode 100644 Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master create mode 120000 Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc create mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h index 550bef85..3a166134 100644 --- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h @@ -125,31 +125,7 @@ public: const RealD _csw_t = 0.0, const RealD _cF = 1.0, const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(), - const ImplParams& impl_p = ImplParams()) - : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) - , csw_r(_csw_r) - , csw_t(_csw_t) - , cF(_cF) - , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) - , Diagonal(&Fgrid), Triangle(&Fgrid) - , DiagonalEven(&Hgrid), TriangleEven(&Hgrid) - , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid) - , DiagonalInv(&Fgrid), TriangleInv(&Fgrid) - , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid) - , DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid) - , Tmp(&Fgrid) - , BoundaryMask(&Fgrid) - , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid) - { - csw_r *= 0.5; - csw_t *= 0.5; - if (clover_anisotropy.isAnisotropic) - csw_r /= clover_anisotropy.xi_0; - - ImportGauge(_Umu); - if (open_boundaries) - CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); - } + const ImplParams& impl_p = ImplParams()); ///////////////////////////////////////////// // Member functions (implementing interface) @@ -161,191 +137,41 @@ public: int ConstEE() override { return 0; }; int isTrivialEE() override { return 0; }; + void Dhop(const FermionField& in, FermionField& out, int dag) override; - void Dhop(const FermionField& in, FermionField& out, int dag) override { - WilsonBase::Dhop(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); - } + void DhopOE(const FermionField& in, FermionField& out, int dag) override; - void DhopOE(const FermionField& in, FermionField& out, int dag) override { - WilsonBase::DhopOE(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); - } + void DhopEO(const FermionField& in, FermionField& out, int dag) override; - void DhopEO(const FermionField& in, FermionField& out, int dag) override { - WilsonBase::DhopEO(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); - } + void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override; - void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override { - WilsonBase::DhopDir(in, out, dir, disp); - if(this->open_boundaries) ApplyBoundaryMask(out); - } + void DhopDirAll(const FermionField& in, std::vector& out) /* override */; - void DhopDirAll(const FermionField& in, std::vector& out) /* override */ { - WilsonBase::DhopDirAll(in, out); - if(this->open_boundaries) { - for(auto& o : out) ApplyBoundaryMask(o); - } - } + void M(const FermionField& in, FermionField& out) override; - void M(const FermionField& in, FermionField& out) override { - out.Checkerboard() = in.Checkerboard(); - WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc - Mooee(in, Tmp); - axpy(out, 1.0, out, Tmp); - if(open_boundaries) ApplyBoundaryMask(out); - } + void Mdag(const FermionField& in, FermionField& out) override; - void Mdag(const FermionField& in, FermionField& out) override { - out.Checkerboard() = in.Checkerboard(); - WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc - MooeeDag(in, Tmp); - axpy(out, 1.0, out, Tmp); - if(open_boundaries) ApplyBoundaryMask(out); - } + void Meooe(const FermionField& in, FermionField& out) override; - void Meooe(const FermionField& in, FermionField& out) override { - WilsonBase::Meooe(in, out); - if(open_boundaries) ApplyBoundaryMask(out); - } + void MeooeDag(const FermionField& in, FermionField& out) override; - void MeooeDag(const FermionField& in, FermionField& out) override { - WilsonBase::MeooeDag(in, out); - if(open_boundaries) ApplyBoundaryMask(out); - } + void Mooee(const FermionField& in, FermionField& out) override; - void Mooee(const FermionField& in, FermionField& out) override { - if(in.Grid()->_isCheckerBoarded) { - if(in.Checkerboard() == Odd) { - MooeeInternal(in, out, DiagonalOdd, TriangleOdd); - } else { - MooeeInternal(in, out, DiagonalEven, TriangleEven); - } - } else { - MooeeInternal(in, out, Diagonal, Triangle); - } - if(open_boundaries) ApplyBoundaryMask(out); - } + void MooeeDag(const FermionField& in, FermionField& out) override; - void MooeeDag(const FermionField& in, FermionField& out) override { - Mooee(in, out); // blocks are hermitian - } + void MooeeInv(const FermionField& in, FermionField& out) override; - void MooeeInv(const FermionField& in, FermionField& out) override { - if(in.Grid()->_isCheckerBoarded) { - if(in.Checkerboard() == Odd) { - MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd); - } else { - MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven); - } - } else { - MooeeInternal(in, out, DiagonalInv, TriangleInv); - } - if(open_boundaries) ApplyBoundaryMask(out); - } + void MooeeInvDag(const FermionField& in, FermionField& out) override; - void MooeeInvDag(const FermionField& in, FermionField& out) override { - MooeeInv(in, out); // blocks are hermitian - } + void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override; - void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override { - DhopDir(in, out, dir, disp); - } + void MdirAll(const FermionField& in, std::vector& out) override; - void MdirAll(const FermionField& in, std::vector& out) override { - DhopDirAll(in, out); - } + void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override; - void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override { - assert(!open_boundaries); // TODO check for changes required for open bc + void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; - // NOTE: code copied from original clover term - conformable(X.Grid(), Y.Grid()); - conformable(X.Grid(), force.Grid()); - GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); - GaugeField clover_force(force.Grid()); - PropagatorField Lambda(force.Grid()); - - // Guido: Here we are hitting some performance issues: - // need to extract the components of the DoubledGaugeField - // for each call - // Possible solution - // Create a vector object to store them? (cons: wasting space) - std::vector U(Nd, this->Umu.Grid()); - - Impl::extractLinkField(U, this->Umu); - - force = Zero(); - // Derivative of the Wilson hopping term - this->DhopDeriv(force, X, Y, dag); - - /////////////////////////////////////////////////////////// - // Clover term derivative - /////////////////////////////////////////////////////////// - Impl::outerProductImpl(Lambda, X, Y); - //std::cout << "Lambda:" << Lambda << std::endl; - - Gamma::Algebra sigma[] = { - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::MinusSigmaZT}; - - /* - sigma_{\mu \nu}= - | 0 sigma[0] sigma[1] sigma[2] | - | sigma[3] 0 sigma[4] sigma[5] | - | sigma[6] sigma[7] 0 sigma[8] | - | sigma[9] sigma[10] sigma[11] 0 | - */ - - int count = 0; - clover_force = Zero(); - for (int mu = 0; mu < 4; mu++) - { - force_mu = Zero(); - for (int nu = 0; nu < 4; nu++) - { - if (mu == nu) - continue; - - RealD factor; - if (nu == 4 || mu == 4) - { - factor = 2.0 * csw_t; - } - else - { - factor = 2.0 * csw_r; - } - PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked - Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok - force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked - count++; - } - - pokeLorentz(clover_force, U[mu] * force_mu, mu); - } - //clover_force *= csw; - force += clover_force; - } - - void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override { - assert(0); - } - - void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override { - assert(0); - } + void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; ///////////////////////////////////////////// // Member functions (internals) @@ -354,93 +180,13 @@ public: void MooeeInternal(const FermionField& in, FermionField& out, const CloverDiagonalField& diagonal, - const CloverTriangleField& triangle) { - assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); - out.Checkerboard() = in.Checkerboard(); - conformable(in, out); - conformable(in, diagonal); - conformable(in, triangle); - - CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle); - } + const CloverTriangleField& triangle); ///////////////////////////////////////////// // Helpers ///////////////////////////////////////////// - void ImportGauge(const GaugeField& _Umu) override { - // NOTE: parts copied from original implementation - - // Import gauge into base class - double t0 = usecond(); - WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that - - // Initialize temporary variables - double t1 = usecond(); - conformable(_Umu.Grid(), this->GaugeGrid()); - GridBase* grid = _Umu.Grid(); - typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); - CloverField TmpOriginal(grid); - - // Compute the field strength terms mu>nu - double t2 = usecond(); - WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); - WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); - WilsonLoops::FieldStrength(Bz, _Umu, Ydir, Xdir); - WilsonLoops::FieldStrength(Ex, _Umu, Tdir, Xdir); - WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); - WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); - - // Compute the Clover Operator acting on Colour and Spin - // multiply here by the clover coefficients for the anisotropy - double t3 = usecond(); - TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r; - TmpOriginal += Helpers::fillCloverXZ(By) * csw_r; - TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r; - TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; - TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; - TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; - TmpOriginal += this->diag_mass; - - // Convert the data layout of the clover term - double t4 = usecond(); - CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); - - // Possible modify the boundary values - double t5 = usecond(); - if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); - - // Invert the clover term in the improved layout - double t6 = usecond(); - CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); - - // Fill the remaining clover fields - double t7 = usecond(); - pickCheckerboard(Even, DiagonalEven, Diagonal); - pickCheckerboard(Even, TriangleEven, Triangle); - pickCheckerboard(Odd, DiagonalOdd, Diagonal); - pickCheckerboard(Odd, TriangleOdd, Triangle); - pickCheckerboard(Even, DiagonalInvEven, DiagonalInv); - pickCheckerboard(Even, TriangleInvEven, TriangleInv); - pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv); - pickCheckerboard(Odd, TriangleInvOdd, TriangleInv); - - // Report timings - double t8 = usecond(); -#if 0 - std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:" - << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 - << ", allocations = " << (t2 - t1) / 1e6 - << ", field strength = " << (t3 - t2) / 1e6 - << ", fill clover = " << (t4 - t3) / 1e6 - << ", convert = " << (t5 - t4) / 1e6 - << ", boundaries = " << (t6 - t5) / 1e6 - << ", inversions = " << (t7 - t6) / 1e6 - << ", pick cbs = " << (t8 - t7) / 1e6 - << ", total = " << (t8 - t0) / 1e6 - << std::endl; -#endif - } + void ImportGauge(const GaugeField& _Umu) override; ///////////////////////////////////////////// // Helpers diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 59fc17d5..78cf7851 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson); #include // 4d wilson like NAMESPACE_CHECK(WilsonTM); #include // 4d wilson clover fermions +#include // 4d compact wilson clover fermions NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); @@ -153,6 +154,23 @@ typedef WilsonCloverFermion WilsonCloverTwoInd typedef WilsonCloverFermion WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonCloverFermion WilsonCloverTwoIndexAntiSymmetricFermionD; +// Compact Clover fermions +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionD; + // Domain Wall fermions typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h new file mode 100644 index 00000000..3dfcb133 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -0,0 +1,363 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h + + Copyright (C) 2017 - 2022 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include + +NAMESPACE_BEGIN(Grid); +template +CompactWilsonCloverFermion::CompactWilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r, + const RealD _csw_t, + const RealD _cF, + const WilsonAnisotropyCoefficients& clover_anisotropy, + const ImplParams& impl_p) + : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) + , csw_r(_csw_r) + , csw_t(_csw_t) + , cF(_cF) + , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) + , Diagonal(&Fgrid), Triangle(&Fgrid) + , DiagonalEven(&Hgrid), TriangleEven(&Hgrid) + , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid) + , DiagonalInv(&Fgrid), TriangleInv(&Fgrid) + , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid) + , DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid) + , Tmp(&Fgrid) + , BoundaryMask(&Fgrid) + , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid) +{ + csw_r *= 0.5; + csw_t *= 0.5; + if (clover_anisotropy.isAnisotropic) + csw_r /= clover_anisotropy.xi_0; + + ImportGauge(_Umu); + if (open_boundaries) + CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); +} + +template +void CompactWilsonCloverFermion::Dhop(const FermionField& in, FermionField& out, int dag) { + WilsonBase::Dhop(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopOE(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopOE(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopEO(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopEO(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) { + WilsonBase::DhopDir(in, out, dir, disp); + if(this->open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopDirAll(const FermionField& in, std::vector& out) { + WilsonBase::DhopDirAll(in, out); + if(this->open_boundaries) { + for(auto& o : out) ApplyBoundaryMask(o); + } +} + +template +void CompactWilsonCloverFermion::M(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc + Mooee(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Mdag(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc + MooeeDag(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Meooe(const FermionField& in, FermionField& out) { + WilsonBase::Meooe(in, out); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MeooeDag(const FermionField& in, FermionField& out) { + WilsonBase::MeooeDag(in, out); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Mooee(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalOdd, TriangleOdd); + } else { + MooeeInternal(in, out, DiagonalEven, TriangleEven); + } + } else { + MooeeInternal(in, out, Diagonal, Triangle); + } + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MooeeDag(const FermionField& in, FermionField& out) { + Mooee(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion::MooeeInv(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd); + } else { + MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven); + } + } else { + MooeeInternal(in, out, DiagonalInv, TriangleInv); + } + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MooeeInvDag(const FermionField& in, FermionField& out) { + MooeeInv(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion::Mdir(const FermionField& in, FermionField& out, int dir, int disp) { + DhopDir(in, out, dir, disp); +} + +template +void CompactWilsonCloverFermion::MdirAll(const FermionField& in, std::vector& out) { + DhopDirAll(in, out); +} + +template +void CompactWilsonCloverFermion::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { + assert(!open_boundaries); // TODO check for changes required for open bc + + // NOTE: code copied from original clover term + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; +} + +template +void CompactWilsonCloverFermion::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion::MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + out.Checkerboard() = in.Checkerboard(); + conformable(in, out); + conformable(in, diagonal); + conformable(in, triangle); + + CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle); +} + +template +void CompactWilsonCloverFermion::ImportGauge(const GaugeField& _Umu) { + // NOTE: parts copied from original implementation + + // Import gauge into base class + double t0 = usecond(); + WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that + + // Initialize temporary variables + double t1 = usecond(); + conformable(_Umu.Grid(), this->GaugeGrid()); + GridBase* grid = _Umu.Grid(); + typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + CloverField TmpOriginal(grid); + + // Compute the field strength terms mu>nu + double t2 = usecond(); + WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); + WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); + WilsonLoops::FieldStrength(Bz, _Umu, Ydir, Xdir); + WilsonLoops::FieldStrength(Ex, _Umu, Tdir, Xdir); + WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); + WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + + // Compute the Clover Operator acting on Colour and Spin + // multiply here by the clover coefficients for the anisotropy + double t3 = usecond(); + TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r; + TmpOriginal += Helpers::fillCloverXZ(By) * csw_r; + TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r; + TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; + TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; + TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; + TmpOriginal += this->diag_mass; + + // Convert the data layout of the clover term + double t4 = usecond(); + CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); + + // Possible modify the boundary values + double t5 = usecond(); + if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); + + // Invert the clover term in the improved layout + double t6 = usecond(); + CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); + + // Fill the remaining clover fields + double t7 = usecond(); + pickCheckerboard(Even, DiagonalEven, Diagonal); + pickCheckerboard(Even, TriangleEven, Triangle); + pickCheckerboard(Odd, DiagonalOdd, Diagonal); + pickCheckerboard(Odd, TriangleOdd, Triangle); + pickCheckerboard(Even, DiagonalInvEven, DiagonalInv); + pickCheckerboard(Even, TriangleInvEven, TriangleInv); + pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv); + pickCheckerboard(Odd, TriangleInvOdd, TriangleInv); + + // Report timings + double t8 = usecond(); +#if 0 + std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:" + << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 + << ", allocations = " << (t2 - t1) / 1e6 + << ", field strength = " << (t3 - t2) / 1e6 + << ", fill clover = " << (t4 - t3) / 1e6 + << ", convert = " << (t5 - t4) / 1e6 + << ", boundaries = " << (t6 - t5) / 1e6 + << ", inversions = " << (t7 - t6) / 1e6 + << ", pick cbs = " << (t8 - t7) / 1e6 + << ", total = " << (t8 - t0) / 1e6 + << std::endl; +#endif +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master new file mode 100644 index 00000000..7c91c252 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master @@ -0,0 +1,41 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master + + Copyright (C) 2017 - 2022 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class CompactWilsonCloverFermion; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index d7553cdb..a09fd420 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -40,7 +40,7 @@ EOF done -CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation" +CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation" for impl in $WILSON_IMPL_LIST do From 86f4e179287d6b2fcf03cd19afcb213b8c959066 Mon Sep 17 00:00:00 2001 From: Julio Maia Date: Mon, 7 Feb 2022 11:29:37 -0600 Subject: [PATCH 24/30] Changing thread block order and adding launch_bounds --- Grid/threads/Accelerator.h | 40 +++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 5991db26..b427b304 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -342,7 +342,7 @@ extern hipStream_t copyStream; /*These routines define mapping from thread grid to loop & vector lane indexing */ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT - return hipThreadIdx_z; + return hipThreadIdx_x; #else return 0; #endif @@ -356,19 +356,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { { __VA_ARGS__;} \ }; \ int nt=acceleratorThreads(); \ - dim3 hip_threads(nt,1,nsimd); \ - dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ - hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ - 0,0, \ - num1,num2,nsimd,lambda); \ + dim3 hip_threads(nsimd, nt, 1); \ + dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ + if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \ + hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \ + 0,0, \ + num1,num2,nsimd, lambda); \ + } else { \ + hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ + 0,0, \ + num1,num2,nsimd, lambda); \ + } \ } + template __global__ +__launch_bounds__(64,1) +void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) +{ + // Following the same scheme as CUDA for now + uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; + uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; + uint64_t z = threadIdx.x; + if ( (x < numx) && (y __global__ +__launch_bounds__(1024,1) void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) { - uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x; - uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y; - uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z; + // Following the same scheme as CUDA for now + uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; + uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; + uint64_t z = threadIdx.x; if ( (x < numx) && (y Date: Mon, 14 Feb 2022 16:04:08 +0000 Subject: [PATCH 25/30] Dont instantiate an Nc=3 and non-GP hardwired code for other implementations --- .../CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc | 1 - .../CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc | 1 - .../CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc | 1 - .../CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc | 1 - ...CloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc | 1 - ...CloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc | 1 - ...lsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc | 1 - ...lsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc | 1 - 8 files changed, 8 deletions(-) delete mode 120000 Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc delete mode 120000 Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc deleted file mode 120000 index a739bb99..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file From a3b022d469f087943be9c46c12169bcec0b5ce58 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 14 Feb 2022 15:09:08 -0500 Subject: [PATCH 26/30] Crusher compile --- systems/Crusher/config-command | 12 ++++++++++++ systems/Crusher/dwf.slurm | 26 ++++++++++++++++++++++++++ systems/Crusher/dwf4.slurm | 26 ++++++++++++++++++++++++++ systems/Crusher/dwf8.slurm | 27 +++++++++++++++++++++++++++ systems/Crusher/mpiwrapper.sh | 12 ++++++++++++ systems/Crusher/sourceme.sh | 5 +++++ 6 files changed, 108 insertions(+) create mode 100644 systems/Crusher/config-command create mode 100644 systems/Crusher/dwf.slurm create mode 100644 systems/Crusher/dwf4.slurm create mode 100644 systems/Crusher/dwf8.slurm create mode 100755 systems/Crusher/mpiwrapper.sh create mode 100644 systems/Crusher/sourceme.sh diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command new file mode 100644 index 00000000..564dbc96 --- /dev/null +++ b/systems/Crusher/config-command @@ -0,0 +1,12 @@ +../../configure --enable-comms=mpi-auto \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-accelerator=hip \ +--enable-gen-simd-width=64 \ +--enable-simd=GPU \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=hipcc MPICXX=mpicxx \ +CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ +--prefix=/ccs/home/chulwoo/Grid \ + LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " diff --git a/systems/Crusher/dwf.slurm b/systems/Crusher/dwf.slurm new file mode 100644 index 00000000..7144a270 --- /dev/null +++ b/systems/Crusher/dwf.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 1 +#SBATCH -n 1 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=NONE +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=8 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap" +srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/dwf4.slurm b/systems/Crusher/dwf4.slurm new file mode 100644 index 00000000..9aebb480 --- /dev/null +++ b/systems/Crusher/dwf4.slurm @@ -0,0 +1,26 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 1 +#SBATCH -n 4 + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=4 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" +srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm new file mode 100644 index 00000000..4bf22d16 --- /dev/null +++ b/systems/Crusher/dwf8.slurm @@ -0,0 +1,27 @@ +#!/bin/bash +# Begin LSF Directives +#SBATCH -A LGT104 +#SBATCH -t 00:20:00 +#SBATCH -p ecp +#SBATCH -J DWF +#SBATCH -o DWF.%J +#SBATCH -e DWF.%J +#SBATCH -N 1 +#SBATCH -n 8 + + +DIR=. +module list +export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +export MPICH_GPU_SUPPORT_ENABLED=1 +#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export MPICH_SMP_SINGLE_COPY_MODE=NONE +#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=1 + +AT=8 +echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE +PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" + +srun -n8 -c 1 --gpus-per-task=1 --gpu-bind=closest --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/mpiwrapper.sh b/systems/Crusher/mpiwrapper.sh new file mode 100755 index 00000000..76c4e364 --- /dev/null +++ b/systems/Crusher/mpiwrapper.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +lrank=$SLURM_LOCALID + +export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID + +echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING" + +$* + + + diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh new file mode 100644 index 00000000..72a2ff4e --- /dev/null +++ b/systems/Crusher/sourceme.sh @@ -0,0 +1,5 @@ +module load PrgEnv-gnu +module load rocm/4.5.0 +module load gmp +module load cray-fftw +module load craype-accel-amd-gfx908 From f49d5c2d22dabe503407a16d1d32e6d68079e3e3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 14 Feb 2022 17:55:16 -0500 Subject: [PATCH 27/30] Updated scripts for crusher --- systems/Crusher/config-command | 2 +- systems/Crusher/dwf.slurm | 18 +++++++++++------- systems/Crusher/dwf4.slurm | 9 +++++---- systems/Crusher/dwf8.slurm | 12 ++++++------ systems/Crusher/sourceme.sh | 2 +- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/systems/Crusher/config-command b/systems/Crusher/config-command index 564dbc96..90737808 100644 --- a/systems/Crusher/config-command +++ b/systems/Crusher/config-command @@ -8,5 +8,5 @@ --disable-gparity \ CXX=hipcc MPICXX=mpicxx \ CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \ ---prefix=/ccs/home/chulwoo/Grid \ LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa " +HIPFLAGS = --amdgpu-target=gfx90a diff --git a/systems/Crusher/dwf.slurm b/systems/Crusher/dwf.slurm index 7144a270..286615ef 100644 --- a/systems/Crusher/dwf.slurm +++ b/systems/Crusher/dwf.slurm @@ -3,24 +3,28 @@ #SBATCH -A LGT104 #SBATCH -t 01:00:00 ##SBATCH -U openmpThu -#SBATCH -p ecp +##SBATCH -p ecp #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 #SBATCH -n 1 +#SBATCH --exclusive DIR=. module list -export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 +#export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM #export MPICH_SMP_SINGLE_COPY_MODE=NONE -export MPICH_SMP_SINGLE_COPY_MODE=CMA -export OMP_NUM_THREADS=8 +#export MPICH_SMP_SINGLE_COPY_MODE=CMA +export OMP_NUM_THREADS=1 AT=8 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap" -srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + +PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1" + +srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/dwf4.slurm b/systems/Crusher/dwf4.slurm index 9aebb480..6bb953c4 100644 --- a/systems/Crusher/dwf4.slurm +++ b/systems/Crusher/dwf4.slurm @@ -3,12 +3,12 @@ #SBATCH -A LGT104 #SBATCH -t 01:00:00 ##SBATCH -U openmpThu -#SBATCH -p ecp #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 #SBATCH -n 4 +#SBATCH --exclusive DIR=. module list @@ -19,8 +19,9 @@ export MPICH_SMP_SINGLE_COPY_MODE=NONE #export MPICH_SMP_SINGLE_COPY_MODE=CMA export OMP_NUM_THREADS=4 -AT=8 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" -srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS +PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0" + +srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS + diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm index 4bf22d16..58686636 100644 --- a/systems/Crusher/dwf8.slurm +++ b/systems/Crusher/dwf8.slurm @@ -1,14 +1,14 @@ #!/bin/bash # Begin LSF Directives #SBATCH -A LGT104 -#SBATCH -t 00:20:00 -#SBATCH -p ecp +#SBATCH -t 01:00:00 +##SBATCH -U openmpThu #SBATCH -J DWF #SBATCH -o DWF.%J #SBATCH -e DWF.%J #SBATCH -N 1 #SBATCH -n 8 - +#SBATCH --exclusive DIR=. module list @@ -19,9 +19,9 @@ export MPICH_SMP_SINGLE_COPY_MODE=NONE #export MPICH_SMP_SINGLE_COPY_MODE=CMA export OMP_NUM_THREADS=1 -AT=8 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE -PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" +PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0" + +srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS -srun -n8 -c 1 --gpus-per-task=1 --gpu-bind=closest --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS diff --git a/systems/Crusher/sourceme.sh b/systems/Crusher/sourceme.sh index 72a2ff4e..3f400ca4 100644 --- a/systems/Crusher/sourceme.sh +++ b/systems/Crusher/sourceme.sh @@ -2,4 +2,4 @@ module load PrgEnv-gnu module load rocm/4.5.0 module load gmp module load cray-fftw -module load craype-accel-amd-gfx908 +module load craype-accel-amd-gfx90a From 0c1618197f7fcc0c5828c1d3aa8a2dc4e915e62e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Feb 2022 08:52:07 -0500 Subject: [PATCH 28/30] Faster intranode MPI works now --- systems/Crusher/dwf8.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systems/Crusher/dwf8.slurm b/systems/Crusher/dwf8.slurm index 58686636..30e83fff 100644 --- a/systems/Crusher/dwf8.slurm +++ b/systems/Crusher/dwf8.slurm @@ -14,8 +14,8 @@ DIR=. module list export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 export MPICH_GPU_SUPPORT_ENABLED=1 -#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM -export MPICH_SMP_SINGLE_COPY_MODE=NONE +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +#export MPICH_SMP_SINGLE_COPY_MODE=NONE #export MPICH_SMP_SINGLE_COPY_MODE=CMA export OMP_NUM_THREADS=1 From e8c187b3233aa6a5a02e4a1a31a3e83d629b5f2b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Feb 2022 11:24:38 -0500 Subject: [PATCH 29/30] SyCL happier? --- Grid/qcd/action/fermion/WilsonCloverHelpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h index 588525cc..60f19317 100644 --- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -726,8 +726,8 @@ public: static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) { conformable(f, m); auto grid = f.Grid(); - const int Nsite = grid->oSites(); - const int Nsimd = grid->Nsimd(); + const uint32_t Nsite = grid->oSites(); + const uint32_t Nsimd = grid->Nsimd(); autoView(f_v, f, AcceleratorWrite); autoView(m_v, m, AcceleratorRead); // NOTE: this function cannot be 'private' since nvcc forbids this for kernels From 63dbaeefaa533717379811f4695a149a82e2d6ec Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Feb 2022 14:01:43 +0000 Subject: [PATCH 30/30] Extra barrier prior to finalize just in case it fixes an issue on Tursa --- Grid/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 6992129e..36854d9c 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -534,6 +534,7 @@ void Grid_init(int *argc,char ***argv) void Grid_finalize(void) { #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); Grid_unquiesce_nodes(); #endif