diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 47a0a92b..9eaf89f3 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index e0fa7f83..3308d8fe 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -191,7 +191,7 @@ public: std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() < + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? + CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; @@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; @@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index c22a54f3..0dc78f04 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? #define GRID_ALLOC_SMALL_LIMIT (4096) +#define GRID_ALLOC_HUGE_LIMIT (2147483648) #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) @@ -70,6 +71,21 @@ enum ViewMode { CpuWriteDiscard = 0x10 // same for now }; +struct MemoryStatus { + uint64_t DeviceBytes; + uint64_t DeviceLRUBytes; + uint64_t DeviceMaxBytes; + uint64_t HostToDeviceBytes; + uint64_t DeviceToHostBytes; + uint64_t HostToDeviceXfer; + uint64_t DeviceToHostXfer; + uint64_t DeviceEvictions; + uint64_t DeviceDestroy; + uint64_t DeviceAllocCacheBytes; + uint64_t HostAllocCacheBytes; +}; + + class MemoryManager { private: @@ -83,7 +99,7 @@ private: } AllocationCacheEntry; static const int NallocCacheMax=128; - static const int NallocType=6; + static const int NallocType=9; static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static int Victim[NallocType]; static int Ncache[NallocType]; @@ -121,7 +137,26 @@ private: static uint64_t DeviceToHostXfer; static uint64_t DeviceEvictions; static uint64_t DeviceDestroy; - + + static uint64_t DeviceCacheBytes(); + static uint64_t HostCacheBytes(); + + static MemoryStatus GetFootprint(void) { + MemoryStatus stat; + stat.DeviceBytes = DeviceBytes; + stat.DeviceLRUBytes = DeviceLRUBytes; + stat.DeviceMaxBytes = DeviceMaxBytes; + stat.HostToDeviceBytes = HostToDeviceBytes; + stat.DeviceToHostBytes = DeviceToHostBytes; + stat.HostToDeviceXfer = HostToDeviceXfer; + stat.DeviceToHostXfer = DeviceToHostXfer; + stat.DeviceEvictions = DeviceEvictions; + stat.DeviceDestroy = DeviceDestroy; + stat.DeviceAllocCacheBytes = DeviceCacheBytes(); + stat.HostAllocCacheBytes = HostCacheBytes(); + return stat; + }; + private: #ifndef GRID_UVM ////////////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 892e3dbe..e7d7a96d 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { - acceleratorCopySynchronise(); - StencilBarrier();// Synch shared memory on a single nodes - int nreq=list.size(); if (nreq==0) return; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 9a273dc4..3a70395c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -37,10 +37,11 @@ Author: Christoph Lehner #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCl - +#ifdef GRID_SYCL +#define GRID_SYCL_LEVEL_ZERO_IPC #endif + NAMESPACE_BEGIN(Grid); #define header "SharedMemoryMpi: " /*Construct from an MPI communicator*/ diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index cf902b58..742c99da 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -297,6 +297,30 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + +template +T iDivUp(T a, T b) // Round a / b to nearest higher integer value +{ return (a % b != 0) ? (a / b + 1) : (a / b); } + +template +__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx >= e1*e2) return; + + int n, b, o; + + n = idx / e2; + b = idx % e2; + o = n*stride + b; + + vector[2*idx + 0] = lo + o; + vector[2*idx + 1] = ro + o; +} + +#endif + ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -321,12 +345,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + ent = e1*e2; + dim3 blockSize(acceleratorThreads()); + dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); + populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o,ro+o); } } +#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o+b,ro+o+b); }} +#endif } else { for(int n=0;n -inline typename vobj::scalar_object sum(const Lattice &arg) +inline typename vobj::scalar_object rankSum(const Lattice &arg) { Integer osites = arg.Grid()->oSites(); #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) - typename vobj::scalar_object ssum; autoView( arg_v, arg, AcceleratorRead); - ssum= sum_gpu(&arg_v[0],osites); + return sum_gpu(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); - auto ssum= sum_cpu(&arg_v[0],osites); + return sum_cpu(&arg_v[0],osites); #endif +} + +template +inline typename vobj::scalar_object sum(const Lattice &arg) +{ + auto ssum = rankSum(arg); arg.Grid()->GlobalSum(ssum); return ssum; } template -inline typename vobj::scalar_object sum_large(const Lattice &arg) +inline typename vobj::scalar_object rankSumLarge(const Lattice &arg) { #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_gpu_large(&arg_v[0],osites); + return sum_gpu_large(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_cpu(&arg_v[0],osites); + return sum_cpu(&arg_v[0],osites); #endif +} + +template +inline typename vobj::scalar_object sum_large(const Lattice &arg) +{ + auto ssum = rankSumLarge(arg); arg.Grid()->GlobalSum(ssum); return ssum; } diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 4bdcce0b..ecf90d19 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi assert(ok); Integer smemSize = numThreads * sizeof(sobj); - // UVM seems to be buggy under later CUDA drivers - // This fails on A100 and driver 5.30.02 / CUDA 12.1 - // Fails with multiple NVCC versions back to 11.4, - // which worked with earlier drivers. - // Not sure which driver had first fail and this bears checking - // Is awkward as must install multiple driver versions + // Move out of UVM + // Turns out I had messed up the synchronise after move to compute stream + // as running this on the default stream fools the synchronise #undef UVM_BLOCK_BUFFER #ifndef UVM_BLOCK_BUFFER commVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); #else Vector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); + reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); result = *buffer_v; #endif diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 180b8437..b7ef0e82 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -440,17 +440,8 @@ public: _grid->GlobalCoorToGlobalIndex(gcoor,gidx); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); -#if 1 - assert(rank == _grid->ThisRank() ); -#else -// - if (rank != _grid->ThisRank() ){ - std::cout <<"rank "<ThisRank() "<<_grid->ThisRank()<< std::endl; -// exit(-42); -// assert(0); - } -#endif + assert(rank == _grid->ThisRank() ); int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 6f9fc480..895e2ce7 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -288,7 +288,36 @@ inline void blockProject(Lattice > &coarseData, blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } +template +inline void batchBlockProject(std::vector>> &coarseData, + const std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = fineData.size(); + assert(coarseData.size() == NBatch); + GridBase * fine = fineData[0].Grid(); + GridBase * coarse= coarseData[0].Grid(); + + Lattice> ip(coarse); + std::vector> fineDataCopy = fineData; + + autoView(ip_, ip, AcceleratorWrite); + for(int v=0;v + accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { + convertType(coarseData_[sc](v),ip_[sc]); + }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); + } + } +} template inline void blockZAXPY(Lattice &fineZ, @@ -590,6 +619,26 @@ inline void blockPromote(const Lattice > &coarseData, } #endif +template +inline void batchBlockPromote(const std::vector>> &coarseData, + std::vector> &fineData, + const VLattice &Basis) +{ + int NBatch = coarseData.size(); + assert(fineData.size() == NBatch); + + GridBase * fine = fineData[0].Grid(); + GridBase * coarse = coarseData[0].Grid(); + for (int k=0; k> ip = PeekIndex<0>(coarseData[k],i); + blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); + } + } +} + // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Simd layouts need not match since we use peek/poke Local template diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index fcf1f1f3..70e2477f 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -463,11 +463,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifdef SYCL_HACK - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; } -#else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} -#endif #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif @@ -478,6 +474,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} #ifndef GRID_CUDA @@ -502,10 +499,9 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif - acceleratorFenceComputeStream(); } else if( interior ) { - if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;} #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif @@ -516,7 +512,6 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif - acceleratorFenceComputeStream(); } assert(0 && " Kernel optimisation case not covered "); } diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/CayleyFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/ContinuedFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/DomainWallEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/MobiusEOFAFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/PartialFractionFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonCloverFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermion5DInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonKernelsInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/WilsonTMFermionInstantiationWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h deleted file mode 100644 index a836ff03..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION WilsonImplD2 diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/CayleyFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/ContinuedFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/DomainWallEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/MobiusEOFAFermionInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/PartialFractionFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonFermion5DInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/WilsonKernelsInstantiationZWilsonImplD2.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h deleted file mode 100644 index 067d6080..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD2/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION ZWilsonImplD2 diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index 476b1c53..c0e2c1d3 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); // NumOp == V // DenOp == M // - AUDIT(); FermionField etaOdd (NumOp.FermionRedBlackGrid()); FermionField etaEven(NumOp.FermionRedBlackGrid()); FermionField tmp (NumOp.FermionRedBlackGrid()); - AUDIT(); pickCheckerboard(Even,etaEven,eta); - AUDIT(); pickCheckerboard(Odd,etaOdd,eta); - AUDIT(); NumOp.ImportGauge(U); - AUDIT(); DenOp.ImportGauge(U); std::cout << " TwoFlavourRefresh: Imported gauge "< Mpc(DenOp); - AUDIT(); SchurDifferentiableOperator Vpc(NumOp); - AUDIT(); std::cout << " TwoFlavourRefresh: Diff ops "<deriv_timer_start(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta as[level].actions.at(a)->deriv_timer_stop(); std::cout << GridLogMessage << "AuditForce["<is_smeared << std::endl; auto name = as[level].actions.at(a)->action_name(); @@ -382,12 +380,12 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "AuditRefresh["<refresh_timer_start(); as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); as[level].actions.at(actionID)->refresh_timer_stop(); std::cout << GridLogMessage << "AuditRefresh["<is_smeared); @@ -434,7 +432,7 @@ public: as[level].actions.at(actionID)->S_timer_stop(); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; - AUDIT(); + } as[level].apply(S_hireps, Representations, level, H); } @@ -447,9 +445,9 @@ public: void operator()(std::vector*> repr_set, Repr& Rep, int level, RealD& H) { for (int a = 0; a < repr_set.size(); ++a) { - AUDIT(); + RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); - AUDIT(); + std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; H += Hterm; @@ -474,10 +472,10 @@ public: Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; as[level].actions.at(actionID)->S_timer_start(); - AUDIT(); + Hterm = as[level].actions.at(actionID)->Sinitial(Us); as[level].actions.at(actionID)->S_timer_stop(); - AUDIT(); + std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; H += Hterm; } @@ -490,7 +488,6 @@ public: void integrate(Field& U) { - AUDIT(); // reset the clocks t_U = 0; for (int level = 0; level < as.size(); ++level) { @@ -508,10 +505,8 @@ public: assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } - AUDIT(); FieldImplementation::Project(U); - AUDIT(); // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index f052cb25..1568cbf9 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -434,7 +434,6 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { - accelerator_barrier(); for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -443,7 +442,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } - _grid->StencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) @@ -452,6 +450,9 @@ public: if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); + acceleratorCopySynchronise(); + // Everyone agrees we are all done + _grid->StencilBarrier(); } //////////////////////////////////////////////////////////////////////// // Blocking send and receive. Either sequential or parallel. @@ -529,7 +530,6 @@ public: { _grid->StencilBarrier();// Synch shared memory on a single nodes - // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); u_comm_offset=0; @@ -655,8 +655,8 @@ public: CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - _grid->StencilBarrier();// Synch shared memory on a single nodes - CommsMerge(decompress,MergersSHM,DecompressionsSHM); + assert(MergersSHM.size()==0); + assert(DecompressionsSHM.size()==0); } template @@ -665,9 +665,11 @@ public: for(int i=0;i + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=16; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + LatticeFermionF src1 (FGrid); random(RNG5,src1); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + // LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;muoSites();ss++){ + for(int s=0;soSites();ss++){ + for(int s=0;s1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " < 1tile.log +for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 +do +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i +done mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index 0e48625b..b8806b30 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -#if [ $MPI_LOCALRANKID = "0" ] -#then -# ~psteinbr/build_pti/ze_tracer -c $@ -# onetrace --chrome-kernel-timeline $@ -#else $@ -#fi +