Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-30 03:24:33 +00:00)

Compare commits: feature/bl...feature/di (36 commits)
| SHA1 |
|---|
| 6b979f0a69 |
| fc4db5e963 |
| 6252ffaf76 |
| 58e020b62a |
| a7e1aceeca |
| 7212432f43 |
| 4a261fab30 |
| 6af97069b9 |
| 5068413cdb |
| 71c6960eea |
| ddf6d5c9e3 |
| 900e01f49b |
| 2376156fbc |
| 3f2fd49db4 |
| 0efa107cb6 |
| 8feedb4f6f |
| 05e562e3d7 |
| dd3bbb8fa2 |
| 2fbcf13c46 |
| 4ea48ef0c4 |
| 5c85774ee3 |
| d8a9a745d8 |
| dcf172da3b |
| 546be724e7 |
| 481bbaf1fc |
| 281488611a |
| bae0f8ea99 |
| bbbcd36ae5 |
| a3e935c902 |
| 7731c7db8e |
| ff97340324 |
| 920a51438d |
| be528b6d27 |
| 7d62f1d6d2 |
| 458c943987 |
| 88015b0858 |
| @@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB); | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h> | ||||
| #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> | ||||
| #include <Grid/algorithms/iterative/BlockConjugateGradient.h> | ||||
| #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> | ||||
|   | ||||
| @@ -191,7 +191,7 @@ public: | ||||
| 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||
| 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||
|  | ||||
| 	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||
| 	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; | ||||
|  | ||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||
|  | ||||
|   | ||||
Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h (new file, 213 lines)
| @@ -0,0 +1,213 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
| #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||
| #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| //Mixed precision restarted defect correction CG | ||||
| template<class FieldD,class FieldF,  | ||||
|   typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, | ||||
|   typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>  | ||||
| class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> { | ||||
| public: | ||||
|   using LinearFunction<FieldD>::operator(); | ||||
|   RealD   Tolerance; | ||||
|   RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||
|   Integer MaxInnerIterations; | ||||
|   Integer MaxOuterIterations; | ||||
|   Integer MaxPatchupIterations; | ||||
|   GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||
|   RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||
|   LinearOperatorBase<FieldF> &Linop_f; | ||||
|   LinearOperatorBase<FieldD> &Linop_d; | ||||
|  | ||||
|   //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||
|   LinearFunction<FieldF> *guesser; | ||||
|   bool updateResidual; | ||||
|    | ||||
|   MixedPrecisionConjugateGradientBatched(RealD tol,  | ||||
|           Integer maxinnerit,  | ||||
|           Integer maxouterit,  | ||||
|           Integer maxpatchit, | ||||
|           GridBase* _sp_grid,  | ||||
|           LinearOperatorBase<FieldF> &_Linop_f,  | ||||
|           LinearOperatorBase<FieldD> &_Linop_d, | ||||
|           bool _updateResidual=true) : | ||||
|     Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||
|     Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), | ||||
|     OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; | ||||
|  | ||||
|   void useGuesser(LinearFunction<FieldF> &g){ | ||||
|     guesser = &g; | ||||
|   } | ||||
|    | ||||
|   void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||
|     std::vector<FieldD> srcs_d_in{src_d_in}; | ||||
|     std::vector<FieldD> sols_d{sol_d}; | ||||
|  | ||||
|     (*this)(srcs_d_in,sols_d); | ||||
|  | ||||
|     sol_d = sols_d[0]; | ||||
|   } | ||||
|  | ||||
|   void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ | ||||
|     assert(src_d_in.size() == sol_d.size()); | ||||
|     int NBatch = src_d_in.size(); | ||||
|  | ||||
|     std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; | ||||
|  | ||||
|     Integer TotalOuterIterations = 0; //Number of restarts | ||||
|     std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations | ||||
|     std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step | ||||
|    | ||||
|     GridStopWatch TotalTimer; | ||||
|     TotalTimer.Start(); | ||||
|  | ||||
|     GridStopWatch InnerCGtimer; | ||||
|     GridStopWatch PrecChangeTimer; | ||||
|      | ||||
|     int cb = src_d_in[0].Checkerboard(); | ||||
|      | ||||
|     std::vector<RealD> src_norm; | ||||
|     std::vector<RealD> norm; | ||||
|     std::vector<RealD> stop; | ||||
|      | ||||
|     GridBase* DoublePrecGrid = src_d_in[0].Grid(); | ||||
|     FieldD tmp_d(DoublePrecGrid); | ||||
|     tmp_d.Checkerboard() = cb; | ||||
|      | ||||
|     FieldD tmp2_d(DoublePrecGrid); | ||||
|     tmp2_d.Checkerboard() = cb; | ||||
|  | ||||
|     std::vector<FieldD> src_d; | ||||
|     std::vector<FieldF> src_f; | ||||
|     std::vector<FieldF> sol_f; | ||||
|  | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       sol_d[i].Checkerboard() = cb; | ||||
|  | ||||
|       src_norm.push_back(norm2(src_d_in[i])); | ||||
|       norm.push_back(0.); | ||||
|       stop.push_back(src_norm[i] * Tolerance*Tolerance); | ||||
|  | ||||
|       src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation | ||||
|  | ||||
|       src_f.push_back(SinglePrecGrid); | ||||
|       src_f[i].Checkerboard() = cb; | ||||
|  | ||||
|       sol_f.push_back(SinglePrecGrid); | ||||
|       sol_f[i].Checkerboard() = cb; | ||||
|     } | ||||
|      | ||||
|     RealD inner_tol = InnerTolerance; | ||||
|      | ||||
|     ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||
|     CG_f.ErrorOnNoConverge = false; | ||||
|      | ||||
|     Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count | ||||
|        | ||||
|     for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||
|       std::cout << GridLogMessage << std::endl; | ||||
|       std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; | ||||
|        | ||||
|       bool allConverged = true; | ||||
|        | ||||
|       for (int i=0; i<NBatch; i++) { | ||||
|         //Compute double precision rsd and also new RHS vector. | ||||
|         Linop_d.HermOp(sol_d[i], tmp_d); | ||||
|         norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector | ||||
|          | ||||
|         std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl; | ||||
|  | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(src_f[i], src_d[i]); | ||||
|         PrecChangeTimer.Stop(); | ||||
|          | ||||
|         sol_f[i] = Zero(); | ||||
|        | ||||
|         if(norm[i] > OuterLoopNormMult * stop[i]) { | ||||
|           allConverged = false; | ||||
|         } | ||||
|       } | ||||
|       if (allConverged) break; | ||||
|  | ||||
|       if (updateResidual) { | ||||
|         RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); | ||||
|         RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); | ||||
|         while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||
|         CG_f.Tolerance = inner_tol; | ||||
|       } | ||||
|  | ||||
|       //Optionally improve inner solver guess (eg using known eigenvectors) | ||||
|       if(guesser != NULL) { | ||||
|         (*guesser)(src_f, sol_f); | ||||
|       } | ||||
|  | ||||
|       for (int i=0; i<NBatch; i++) { | ||||
|         //Inner CG | ||||
|         InnerCGtimer.Start(); | ||||
|         CG_f(Linop_f, src_f[i], sol_f[i]); | ||||
|         InnerCGtimer.Stop(); | ||||
|         TotalInnerIterations[i] += CG_f.IterationsToComplete; | ||||
|          | ||||
|         //Convert sol back to double and add to double prec solution | ||||
|         PrecChangeTimer.Start(); | ||||
|         precisionChange(tmp_d, sol_f[i]); | ||||
|         PrecChangeTimer.Stop(); | ||||
|          | ||||
|         axpy(sol_d[i], 1.0, tmp_d, sol_d[i]); | ||||
|       } | ||||
|  | ||||
|     } | ||||
|      | ||||
|     //Final trial CG | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl; | ||||
|      | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations); | ||||
|       CG_d(Linop_d, src_d_in[i], sol_d[i]); | ||||
|       TotalFinalStepIterations[i] += CG_d.IterationsToComplete; | ||||
|     } | ||||
|  | ||||
|     TotalTimer.Stop(); | ||||
|  | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     for (int i=0; i<NBatch; i++) { | ||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl; | ||||
|     } | ||||
|     std::cout << GridLogMessage << std::endl; | ||||
|     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||
|      | ||||
|   } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| #endif | ||||
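For orientation, a minimal usage sketch of the new batched solver; only MixedPrecisionConjugateGradientBatched itself comes from this diff, while the Dirac operators (DdwfD/DdwfF), the grids, and NBatch are assumed to be set up elsewhere:

```cpp
// Hypothetical usage sketch, not part of the commit. Assumes double- and
// single-precision operators DdwfD/DdwfF and grids prepared elsewhere.
MdagMLinearOperator<DomainWallFermionD,LatticeFermionD> HermOpD(DdwfD);
MdagMLinearOperator<DomainWallFermionF,LatticeFermionF> HermOpF(DdwfF);

MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
  mCG(1.0e-8,          // Tolerance: outer (true) residual target
      10000,           // MaxInnerIterations per single-precision restart
      50,              // MaxOuterIterations (restarts)
      1000,            // MaxPatchupIterations for the final double-prec CG
      SinglePrecGrid, HermOpF, HermOpD);

std::vector<LatticeFermionD> srcs(NBatch, LatticeFermionD(DoublePrecGrid));
std::vector<LatticeFermionD> sols(NBatch, LatticeFermionD(DoublePrecGrid));
for (auto &s : sols) s = Zero();
mCG(srcs, sols);  // one outer defect-correction loop drives all NBatch solves
```

Sharing a single outer loop lets the residual-dependent inner tolerance be set once per restart from the worst-converged solve in the batch, which is what the updateResidual branch above does with normMax and stopMax.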
| @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| /*Allocation types, saying which pointer cache should be used*/ | ||||
| #define Cpu      (0) | ||||
| #define CpuSmall (1) | ||||
| #define Acc      (2) | ||||
| #define AccSmall (3) | ||||
| #define Shared   (4) | ||||
| #define SharedSmall (5) | ||||
| #define CpuHuge  (1) | ||||
| #define CpuSmall (2) | ||||
| #define Acc      (3) | ||||
| #define AccHuge  (4) | ||||
| #define AccSmall (5) | ||||
| #define Shared   (6) | ||||
| #define SharedHuge  (7) | ||||
| #define SharedSmall (8) | ||||
| #undef GRID_MM_VERBOSE  | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| @@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void) | ||||
|    | ||||
| } | ||||
|  | ||||
| uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } | ||||
| uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Data tables for recently freed pointer caches | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; | ||||
| int MemoryManager::Victim[MemoryManager::NallocType]; | ||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 }; | ||||
| int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; | ||||
| uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; | ||||
| ////////////////////////////////////////////////////////////////////// | ||||
| // Actual allocation and deallocation utils | ||||
| @@ -170,6 +176,16 @@ void MemoryManager::Init(void) | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   str= getenv("GRID_ALLOC_NCACHE_HUGE"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
|     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||
|       Ncache[CpuHuge]=Nc; | ||||
|       Ncache[AccHuge]=Nc; | ||||
|       Ncache[SharedHuge]=Nc; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   str= getenv("GRID_ALLOC_NCACHE_SMALL"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
| @@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) { | ||||
|    | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl; | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl; | ||||
|   std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl; | ||||
| #endif | ||||
|    | ||||
| #ifdef GRID_UVM | ||||
| @@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) { | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,int type)  | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); | ||||
|   int cache = type + small; | ||||
|   int cache; | ||||
|   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||
|   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||
|   else                                     cache = type; | ||||
|  | ||||
|   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);   | ||||
| #else | ||||
|   return ptr; | ||||
| @@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) | ||||
|  | ||||
| void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)  | ||||
| { | ||||
|   assert(ncache>0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|  | ||||
|   if (ncache == 0) return ptr; | ||||
|  | ||||
|   void * ret = NULL; | ||||
|   int v = -1; | ||||
|  | ||||
| @@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | ||||
| void *MemoryManager::Lookup(size_t bytes,int type) | ||||
| { | ||||
| #ifdef ALLOCATION_CACHE | ||||
|   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); | ||||
|   int cache = type+small; | ||||
|   int cache; | ||||
|   if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; | ||||
|   else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; | ||||
|   else                                     cache = type; | ||||
|  | ||||
|   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); | ||||
| #else | ||||
|   return NULL; | ||||
| @@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) | ||||
|  | ||||
| void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)  | ||||
| { | ||||
|   assert(ncache>0); | ||||
| #ifdef GRID_OMP | ||||
|   assert(omp_in_parallel()==0); | ||||
| #endif  | ||||
|   | ||||
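The new GRID_ALLOC_NCACHE_HUGE knob mirrors the existing SMALL one; below is a hedged sketch of setting it programmatically before Grid initialises. The environment variable name comes from the diff, everything else is illustrative:

```cpp
// Illustrative only. MemoryManager::Init() reads GRID_ALLOC_NCACHE_HUGE
// (see diff above) and applies it to CpuHuge/AccHuge/SharedHuge, provided
// 0 <= value < NallocCacheMax.
#include <cstdlib>
#include <Grid/Grid.h>

int main(int argc, char **argv) {
  setenv("GRID_ALLOC_NCACHE_HUGE", "4", 1); // cache up to 4 freed huge blocks
  Grid::Grid_init(&argc, &argv);            // MemoryManager::Init() runs here
  // ... application ...
  Grid::Grid_finalize();
}
```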
| @@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid); | ||||
| // Move control to configure.ac and Config.h? | ||||
|  | ||||
| #define GRID_ALLOC_SMALL_LIMIT (4096) | ||||
| #define GRID_ALLOC_HUGE_LIMIT  (2147483648) | ||||
|  | ||||
| #define STRINGIFY(x) #x | ||||
| #define TOSTRING(x) STRINGIFY(x) | ||||
| @@ -70,6 +71,21 @@ enum ViewMode { | ||||
|   CpuWriteDiscard = 0x10 // same for now | ||||
| }; | ||||
|  | ||||
| struct MemoryStatus { | ||||
|   uint64_t     DeviceBytes; | ||||
|   uint64_t     DeviceLRUBytes; | ||||
|   uint64_t     DeviceMaxBytes; | ||||
|   uint64_t     HostToDeviceBytes; | ||||
|   uint64_t     DeviceToHostBytes; | ||||
|   uint64_t     HostToDeviceXfer; | ||||
|   uint64_t     DeviceToHostXfer; | ||||
|   uint64_t     DeviceEvictions; | ||||
|   uint64_t     DeviceDestroy; | ||||
|   uint64_t     DeviceAllocCacheBytes; | ||||
|   uint64_t     HostAllocCacheBytes; | ||||
| }; | ||||
|  | ||||
|  | ||||
| class MemoryManager { | ||||
| private: | ||||
|  | ||||
| @@ -83,7 +99,7 @@ private: | ||||
|   } AllocationCacheEntry; | ||||
|  | ||||
|   static const int NallocCacheMax=128;  | ||||
|   static const int NallocType=6; | ||||
|   static const int NallocType=9; | ||||
|   static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; | ||||
|   static int Victim[NallocType]; | ||||
|   static int Ncache[NallocType]; | ||||
| @@ -122,6 +138,25 @@ private: | ||||
|   static uint64_t     DeviceEvictions; | ||||
|   static uint64_t     DeviceDestroy; | ||||
|    | ||||
|   static uint64_t     DeviceCacheBytes(); | ||||
|   static uint64_t     HostCacheBytes(); | ||||
|  | ||||
|   static MemoryStatus GetFootprint(void) { | ||||
|     MemoryStatus stat; | ||||
|     stat.DeviceBytes       = DeviceBytes; | ||||
|     stat.DeviceLRUBytes    = DeviceLRUBytes; | ||||
|     stat.DeviceMaxBytes    = DeviceMaxBytes; | ||||
|     stat.HostToDeviceBytes = HostToDeviceBytes; | ||||
|     stat.DeviceToHostBytes = DeviceToHostBytes; | ||||
|     stat.HostToDeviceXfer  = HostToDeviceXfer; | ||||
|     stat.DeviceToHostXfer  = DeviceToHostXfer; | ||||
|     stat.DeviceEvictions   = DeviceEvictions; | ||||
|     stat.DeviceDestroy     = DeviceDestroy; | ||||
|     stat.DeviceAllocCacheBytes = DeviceCacheBytes(); | ||||
|     stat.HostAllocCacheBytes   = HostCacheBytes(); | ||||
|     return stat; | ||||
|   }; | ||||
|    | ||||
|  private: | ||||
| #ifndef GRID_UVM | ||||
|   ////////////////////////////////////////////////////////////////////// | ||||
|   | ||||
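A short sketch of consuming the new footprint API; GetFootprint() and the MemoryStatus field names are as declared above, the surrounding code is illustrative:

```cpp
// Illustrative: snapshot the allocator state, e.g. between HMC trajectories.
Grid::MemoryStatus stat = Grid::MemoryManager::GetFootprint();
std::cout << "Device resident bytes : " << stat.DeviceBytes           << "\n"
          << "Device alloc cache    : " << stat.DeviceAllocCacheBytes << "\n"
          << "Host alloc cache      : " << stat.HostAllocCacheBytes   << "\n"
          << "H2D transfers         : " << stat.HostToDeviceXfer      << "\n";
```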
| @@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | ||||
| } | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||
| { | ||||
|   acceleratorCopySynchronise(); | ||||
|   StencilBarrier();// Synch shared memory on a single nodes | ||||
|  | ||||
|   int nreq=list.size(); | ||||
|  | ||||
|   if (nreq==0) return; | ||||
|   | ||||
| @@ -37,10 +37,11 @@ Author: Christoph Lehner <christoph@lhnr.de> | ||||
| #ifdef GRID_HIP | ||||
| #include <hip/hip_runtime_api.h> | ||||
| #endif | ||||
| #ifdef GRID_SYCl | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | ||||
| #endif | ||||
|  | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
| #define header "SharedMemoryMpi: " | ||||
| /*Construct from an MPI communicator*/ | ||||
|   | ||||
| @@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA | ||||
|   } | ||||
| } | ||||
|  | ||||
| #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||
|  | ||||
| template <typename T> | ||||
| T iDivUp(T a, T b) // Round a / b to nearest higher integer value | ||||
| { return (a % b != 0) ? (a / b + 1) : (a / b); } | ||||
|  | ||||
| template <typename T> | ||||
| __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) | ||||
| { | ||||
|     int idx = blockIdx.x*blockDim.x + threadIdx.x; | ||||
|     if (idx >= e1*e2) return; | ||||
|  | ||||
|     int n, b, o; | ||||
|  | ||||
|     n = idx / e2; | ||||
|     b = idx % e2; | ||||
|     o = n*stride + b; | ||||
|  | ||||
|     vector[2*idx + 0] = lo + o; | ||||
|     vector[2*idx + 1] = ro + o; | ||||
| } | ||||
|  | ||||
| #endif | ||||
|  | ||||
| ////////////////////////////////////////////////////// | ||||
| // local to node block strided copies | ||||
| ////////////////////////////////////////////////////// | ||||
| @@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | ||||
|   int ent=0; | ||||
|  | ||||
|   if(cbmask == 0x3 ){ | ||||
| #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||
|     ent = e1*e2; | ||||
|     dim3 blockSize(acceleratorThreads()); | ||||
|     dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); | ||||
|     populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); | ||||
|     accelerator_barrier(); | ||||
| #else | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
|         int o =n*stride+b; | ||||
| 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|   } else {  | ||||
|     for(int n=0;n<e1;n++){ | ||||
|       for(int b=0;b<e2;b++){ | ||||
| @@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | ||||
|   int ent=0; | ||||
|  | ||||
|   if ( cbmask == 0x3 ) { | ||||
| #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||
|     ent = e1*e2; | ||||
|     dim3 blockSize(acceleratorThreads()); | ||||
|     dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); | ||||
|     populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); | ||||
|     accelerator_barrier(); | ||||
| #else | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     for(int b=0;b<e2;b++){ | ||||
|       int o  =n*stride; | ||||
|       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||
|     }} | ||||
| #endif | ||||
|   } else { | ||||
|     for(int n=0;n<e1;n++){ | ||||
|     for(int b=0;b<e2;b++){ | ||||
|   | ||||
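The device kernel replaces the serial (n,b) double loop with one flattened index; here is a minimal host-side sketch of the identical mapping, useful for checking the table (illustrative, not from the commit):

```cpp
// Illustrative check: the flattened index used by populate_Cshift_table
// reproduces the serial double loop it replaces.
std::vector<std::pair<int,int>> table(e1 * e2);
for (int idx = 0; idx < e1 * e2; idx++) {
  int n = idx / e2;               // plane index, as in the kernel
  int b = idx % e2;               // within-plane index
  int o = n * stride + b;
  table[idx] = {lo + o, ro + o};  // matches Cshift_table[ent++] above
}
```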
| @@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   typename vobj::scalar_object ssum; | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   ssum= sum_gpu(&arg_v[0],osites); | ||||
|   return sum_gpu(&arg_v[0],osites); | ||||
| #else | ||||
|   autoView(arg_v, arg, CpuRead); | ||||
|   auto ssum= sum_cpu(&arg_v[0],osites); | ||||
|   return sum_cpu(&arg_v[0],osites); | ||||
| #endif   | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
|   auto ssum = rankSum(arg); | ||||
|   arg.Grid()->GlobalSum(ssum); | ||||
|   return ssum; | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||
| inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_gpu_large(&arg_v[0],osites); | ||||
|   return sum_gpu_large(&arg_v[0],osites); | ||||
| #else | ||||
|   autoView(arg_v, arg, CpuRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_cpu(&arg_v[0],osites); | ||||
|   return sum_cpu(&arg_v[0],osites); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||
| { | ||||
|   auto ssum = rankSumLarge(arg); | ||||
|   arg.Grid()->GlobalSum(ssum); | ||||
|   return ssum; | ||||
| } | ||||
|   | ||||
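The refactor factors the node-local reduction out of sum(); a hedged sketch of the resulting identity, with the field assumed to be set up elsewhere:

```cpp
// Illustrative: sum() is now exactly rankSum() plus one GlobalSum, so a
// node-local partial can be inspected before the MPI reduction.
auto partial = rankSum(field);          // this rank's contribution only
auto global  = partial;
field.Grid()->GlobalSum(global);        // now equal to sum(field)
```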
| @@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | ||||
|   assert(ok); | ||||
|  | ||||
|   Integer smemSize = numThreads * sizeof(sobj); | ||||
|   // UVM seems to be buggy under later CUDA drivers | ||||
|   // This fails on A100 and driver 5.30.02 / CUDA 12.1 | ||||
|   // Fails with multiple NVCC versions back to 11.4, | ||||
|   // which worked with earlier drivers. | ||||
|   // Not sure which driver had first fail and this bears checking | ||||
|   // Is awkward as must install multiple driver versions | ||||
|   // Move out of UVM | ||||
|   // Turns out I had messed up the synchronise after move to compute stream | ||||
|   // as running this on the default stream fools the synchronise | ||||
| #undef UVM_BLOCK_BUFFER   | ||||
| #ifndef UVM_BLOCK_BUFFER   | ||||
|   commVector<sobj> buffer(numBlocks); | ||||
|   sobj *buffer_v = &buffer[0]; | ||||
|   sobj result; | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||
|   accelerator_barrier(); | ||||
|   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); | ||||
| #else | ||||
|   Vector<sobj> buffer(numBlocks); | ||||
|   sobj *buffer_v = &buffer[0]; | ||||
|   sobj result; | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); | ||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||
|   accelerator_barrier(); | ||||
|   result = *buffer_v; | ||||
| #endif | ||||
|   | ||||
| @@ -440,17 +440,8 @@ public: | ||||
| 	_grid->GlobalCoorToGlobalIndex(gcoor,gidx); | ||||
|  | ||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||
| #if 1 | ||||
| 	assert(rank == _grid->ThisRank() ); | ||||
| #else | ||||
| //  | ||||
| 	if (rank != _grid->ThisRank() ){ | ||||
| 	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl; | ||||
| //	exit(-42); | ||||
| //	assert(0); | ||||
| 	} | ||||
| #endif | ||||
|  | ||||
| 	assert(rank == _grid->ThisRank() ); | ||||
| 	 | ||||
| 	int l_idx=generator_idx(o_idx,i_idx); | ||||
| 	_generators[l_idx] = master_engine; | ||||
|   | ||||
| @@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData, | ||||
|     blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);  | ||||
|   } | ||||
| } | ||||
| template<class vobj,class CComplex,int nbasis,class VLattice> | ||||
| inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||
|                                const std::vector<Lattice<vobj>> &fineData, | ||||
|                                const VLattice &Basis) | ||||
| { | ||||
|   int NBatch = fineData.size(); | ||||
|   assert(coarseData.size() == NBatch); | ||||
|  | ||||
|   GridBase * fine  = fineData[0].Grid(); | ||||
|   GridBase * coarse= coarseData[0].Grid(); | ||||
|  | ||||
|   Lattice<iScalar<CComplex>> ip(coarse); | ||||
|   std::vector<Lattice<vobj>> fineDataCopy = fineData; | ||||
|  | ||||
|   autoView(ip_, ip, AcceleratorWrite); | ||||
|   for(int v=0;v<nbasis;v++) { | ||||
|     for (int k=0; k<NBatch; k++) { | ||||
|       autoView( coarseData_ , coarseData[k], AcceleratorWrite); | ||||
|       blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine> | ||||
|       accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { | ||||
|         convertType(coarseData_[sc](v),ip_[sc]); | ||||
|       }); | ||||
|  | ||||
|       // improve numerical stability of projection | ||||
|       // |fine> = |fine> - <basis|fine> |basis> | ||||
|       ip=-ip; | ||||
|       blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);  | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| template<class vobj,class vobj2,class CComplex> | ||||
|   inline void blockZAXPY(Lattice<vobj> &fineZ, | ||||
| @@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<class vobj,class CComplex,int nbasis,class VLattice> | ||||
| inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, | ||||
|                                std::vector<Lattice<vobj>> &fineData, | ||||
|                                const VLattice &Basis) | ||||
| { | ||||
|   int NBatch = coarseData.size(); | ||||
|   assert(fineData.size() == NBatch); | ||||
|  | ||||
|   GridBase * fine   = fineData[0].Grid(); | ||||
|   GridBase * coarse = coarseData[0].Grid(); | ||||
|   for (int k=0; k<NBatch; k++) | ||||
|     fineData[k]=Zero(); | ||||
|   for (int i=0;i<nbasis;i++) { | ||||
|     for (int k=0; k<NBatch; k++) { | ||||
|       Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i); | ||||
|       blockZAXPY(fineData[k],ip,Basis[i],fineData[k]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. | ||||
| // Simd layouts need not match since we use peek/poke Local | ||||
| template<class vobj,class vvobj> | ||||
|   | ||||
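For orientation, a hedged sketch of driving the new batched transfer operators; the basis, grids, and the template types (vobj, CComplex, nbasis) are assumed to exist exactly as in the single-field blockProject/blockPromote API:

```cpp
// Illustrative usage, not from the commit: project/promote a batch of
// fine fields with one call each instead of a loop over blockProject.
std::vector<Lattice<iVector<CComplex,nbasis>>> coarse(NBatch,
    Lattice<iVector<CComplex,nbasis>>(CoarseGrid));
std::vector<Lattice<vobj>> fine(NBatch, Lattice<vobj>(FineGrid));
// ... fill fine[k] ...
batchBlockProject(coarse, fine, Basis);  // coarse[k] = <Basis|fine[k]>
batchBlockPromote(coarse, fine, Basis);  // fine[k] rebuilt from coarse[k]
```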
| @@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|  | ||||
|    if( interior && exterior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||
| #ifdef SYCL_HACK      | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; } | ||||
| #else | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||
| #endif      | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;} | ||||
| #endif | ||||
| @@ -478,6 +474,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      acceleratorFenceComputeStream(); | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
| @@ -502,10 +499,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} | ||||
| #endif | ||||
| @@ -516,7 +512,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } | ||||
|    assert(0 && " Kernel optimisation case not covered "); | ||||
|   } | ||||
|   | ||||
| @@ -1 +0,0 @@ | ||||
| ../CayleyFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../ContinuedFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../DomainWallEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../MobiusEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../PartialFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonCloverFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonTMFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| #define IMPLEMENTATION WilsonImplD2 | ||||
| @@ -1 +0,0 @@ | ||||
| ../CayleyFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../ContinuedFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../DomainWallEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../MobiusEOFAFermionInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../PartialFractionFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonFermion5DInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| ../WilsonKernelsInstantiation.cc.master | ||||
| @@ -1 +0,0 @@ | ||||
| #define IMPLEMENTATION ZWilsonImplD2 | ||||
| @@ -38,19 +38,15 @@ NAMESPACE_BEGIN(Grid); | ||||
|     // cf. GeneralEvenOddRational.h for details | ||||
|     ///////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|        | ||||
|     template<class ImplD, class ImplF, class ImplD2> | ||||
|     template<class ImplD, class ImplF> | ||||
|     class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> { | ||||
|     private: | ||||
|       typedef typename ImplD2::FermionField FermionFieldD2; | ||||
|       typedef typename ImplD::FermionField FermionFieldD; | ||||
|       typedef typename ImplF::FermionField FermionFieldF; | ||||
|  | ||||
|       FermionOperator<ImplD> & NumOpD; | ||||
|       FermionOperator<ImplD> & DenOpD; | ||||
|  | ||||
|       FermionOperator<ImplD2> & NumOpD2; | ||||
|       FermionOperator<ImplD2> & DenOpD2; | ||||
|       | ||||
|       FermionOperator<ImplF> & NumOpF; | ||||
|       FermionOperator<ImplF> & DenOpF; | ||||
|  | ||||
| @@ -64,40 +60,31 @@ NAMESPACE_BEGIN(Grid); | ||||
| 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx); | ||||
| 	msCG(schurOp,in, out); | ||||
| #else | ||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | ||||
| 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||
| 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF); | ||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||
| 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||
|  | ||||
| 	// Action better with higher precision? | ||||
| 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	precisionChange(inD2,in); | ||||
| 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; | ||||
| 	msCG(schurOpD2, inD2, outD2); | ||||
| 	precisionChange(out,outD2); | ||||
| 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	msCG(schurOpD, in, out); | ||||
| #endif | ||||
|       } | ||||
|       virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){ | ||||
| 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2); | ||||
| 	SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD); | ||||
| 	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF); | ||||
|  | ||||
| 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid()); | ||||
| 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid()); | ||||
| 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	precisionChange(inD2,in); | ||||
| 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl; | ||||
| 	msCG(schurOpD2, inD2, out_elemsD2, outD2); | ||||
| 	precisionChange(out,outD2); | ||||
| 	for(int i=0;i<out_elems.size();i++){ | ||||
| 	  precisionChange(out_elems[i],out_elemsD2[i]); | ||||
| 	} | ||||
| 	FermionFieldD inD(NumOpD.FermionRedBlackGrid()); | ||||
| 	FermionFieldD outD(NumOpD.FermionRedBlackGrid()); | ||||
| 	std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid()); | ||||
| 	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq); | ||||
| 	msCG(schurOpD, in, out_elems, out); | ||||
|       } | ||||
|       //Allow derived classes to override the gauge import | ||||
|       virtual void ImportGauge(const typename ImplD::GaugeField &Ud){ | ||||
|  | ||||
| 	typename ImplF::GaugeField Uf(NumOpF.GaugeGrid()); | ||||
| 	typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid()); | ||||
| 	typename ImplD::GaugeField Ud2(NumOpD.GaugeGrid()); | ||||
| 	precisionChange(Uf, Ud); | ||||
| 	precisionChange(Ud2, Ud); | ||||
|  | ||||
| @@ -109,20 +96,18 @@ NAMESPACE_BEGIN(Grid); | ||||
| 	NumOpF.ImportGauge(Uf); | ||||
| 	DenOpF.ImportGauge(Uf); | ||||
|  | ||||
| 	NumOpD2.ImportGauge(Ud2); | ||||
| 	DenOpD2.ImportGauge(Ud2); | ||||
| 	NumOpD.ImportGauge(Ud2); | ||||
| 	DenOpD.ImportGauge(Ud2); | ||||
|       } | ||||
|        | ||||
|     public: | ||||
|       GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD>  &_NumOpD, FermionOperator<ImplD>  &_DenOpD,  | ||||
| 							      FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,  | ||||
| 							      FermionOperator<ImplD2>  &_NumOpD2, FermionOperator<ImplD2>  &_DenOpD2,  | ||||
| 							      const RationalActionParams & p, Integer _ReliableUpdateFreq | ||||
| 							      ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p), | ||||
| 								  ReliableUpdateFreq(_ReliableUpdateFreq), | ||||
| 								  NumOpD(_NumOpD), DenOpD(_DenOpD), | ||||
| 								  NumOpF(_NumOpF), DenOpF(_DenOpF), | ||||
| 								  NumOpD2(_NumOpD2), DenOpD2(_DenOpD2) | ||||
| 								  NumOpF(_NumOpF), DenOpF(_DenOpF) | ||||
|       {} | ||||
|        | ||||
|       virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";} | ||||
|   | ||||
| @@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid); | ||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||
|     }; | ||||
|  | ||||
|     template<class Impl,class ImplF,class ImplD2> | ||||
|     template<class Impl,class ImplF> | ||||
|     class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction | ||||
|       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> { | ||||
|       : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> { | ||||
|     public: | ||||
|       typedef OneFlavourRationalParams Params; | ||||
|     private: | ||||
| @@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid); | ||||
| 								 FermionOperator<Impl>  &_DenOp,  | ||||
| 								 FermionOperator<ImplF>  &_NumOpF,  | ||||
| 								 FermionOperator<ImplF>  &_DenOpF,  | ||||
| 								 FermionOperator<ImplD2>  &_NumOpD2,  | ||||
| 								 FermionOperator<ImplD2>  &_DenOpD2,  | ||||
| 								 const Params & p, Integer ReliableUpdateFreq | ||||
| 							) :  | ||||
| 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){} | ||||
| 	GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){} | ||||
|  | ||||
|       virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}       | ||||
|     }; | ||||
|   | ||||
| @@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid); | ||||
|         // NumOp == V | ||||
|         // DenOp == M | ||||
|         // | ||||
|     AUDIT(); | ||||
|         FermionField etaOdd (NumOp.FermionRedBlackGrid()); | ||||
|         FermionField etaEven(NumOp.FermionRedBlackGrid()); | ||||
|         FermionField tmp    (NumOp.FermionRedBlackGrid()); | ||||
|  | ||||
|     AUDIT(); | ||||
|         pickCheckerboard(Even,etaEven,eta); | ||||
|     AUDIT(); | ||||
|         pickCheckerboard(Odd,etaOdd,eta); | ||||
|  | ||||
|     AUDIT(); | ||||
|         NumOp.ImportGauge(U); | ||||
|     AUDIT(); | ||||
|         DenOp.ImportGauge(U); | ||||
| 	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl; | ||||
|     AUDIT(); | ||||
|  | ||||
|         SchurDifferentiableOperator<Impl> Mpc(DenOp); | ||||
|     AUDIT(); | ||||
|         SchurDifferentiableOperator<Impl> Vpc(NumOp); | ||||
|     AUDIT(); | ||||
|  | ||||
| 	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl; | ||||
|     AUDIT(); | ||||
|         // Odd det factors | ||||
|         Mpc.MpcDag(etaOdd,PhiOdd); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl; | ||||
|         tmp=Zero(); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl; | ||||
|     AUDIT(); | ||||
|         HeatbathSolver(Vpc,PhiOdd,tmp); | ||||
|     AUDIT(); | ||||
| 	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl; | ||||
|         Vpc.Mpc(tmp,PhiOdd);             | ||||
| 	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl; | ||||
|   | ||||
| @@ -134,14 +134,12 @@ protected: | ||||
|       double start_force = usecond(); | ||||
|  | ||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl; | ||||
|       AUDIT(); | ||||
|        | ||||
|       as[level].actions.at(a)->deriv_timer_start(); | ||||
|       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||
|       as[level].actions.at(a)->deriv_timer_stop(); | ||||
|  | ||||
|       std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl; | ||||
|       AUDIT(); | ||||
|  | ||||
|       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||
|       auto name = as[level].actions.at(a)->action_name(); | ||||
| @@ -382,12 +380,12 @@ public: | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
|  | ||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl; | ||||
| 	AUDIT(); | ||||
|  | ||||
| 	as[level].actions.at(actionID)->refresh_timer_start(); | ||||
|         as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); | ||||
| 	as[level].actions.at(actionID)->refresh_timer_stop(); | ||||
| 	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl; | ||||
| 	AUDIT(); | ||||
|  | ||||
|       } | ||||
|  | ||||
|       // Refresh the higher representation actions | ||||
| @@ -424,7 +422,7 @@ public: | ||||
|     // Actions | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||
| 	AUDIT(); | ||||
|  | ||||
|         // get gauge field from the SmearingPolicy and | ||||
|         // based on the boolean is_smeared in actionID | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
| @@ -434,7 +432,7 @@ public: | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
| 	AUDIT(); | ||||
|  | ||||
|       } | ||||
|       as[level].apply(S_hireps, Representations, level, H); | ||||
|     } | ||||
| @@ -447,9 +445,9 @@ public: | ||||
|     void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) { | ||||
|        | ||||
|       for (int a = 0; a < repr_set.size(); ++a) { | ||||
| 	AUDIT(); | ||||
|  | ||||
|         RealD Hterm = repr_set.at(a)->Sinitial(Rep.U); | ||||
| 	AUDIT(); | ||||
|  | ||||
|         std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
|  | ||||
| @@ -474,10 +472,10 @@ public: | ||||
|         Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl; | ||||
| 	        as[level].actions.at(actionID)->S_timer_start(); | ||||
| 	AUDIT(); | ||||
|  | ||||
|         Hterm = as[level].actions.at(actionID)->Sinitial(Us); | ||||
|    	        as[level].actions.at(actionID)->S_timer_stop(); | ||||
| 	AUDIT(); | ||||
|  | ||||
|         std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl; | ||||
|         H += Hterm; | ||||
|       } | ||||
| @@ -490,7 +488,6 @@ public: | ||||
|    | ||||
|   void integrate(Field& U)  | ||||
|   { | ||||
|     AUDIT(); | ||||
|     // reset the clocks | ||||
|     t_U = 0; | ||||
|     for (int level = 0; level < as.size(); ++level) { | ||||
| @@ -508,10 +505,8 @@ public: | ||||
|       assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same | ||||
|       std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; | ||||
|     } | ||||
|     AUDIT(); | ||||
|  | ||||
|     FieldImplementation::Project(U); | ||||
|     AUDIT(); | ||||
|  | ||||
|     // and that we indeed got to the end of the trajectory | ||||
|     assert(fabs(t_U - Params.trajL) < 1.0e-6); | ||||
|   | ||||
| @@ -434,7 +434,6 @@ public: | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||
|   { | ||||
|     accelerator_barrier(); | ||||
|     for(int i=0;i<Packets.size();i++){ | ||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||
| 					Packets[i].send_buf, | ||||
| @@ -443,7 +442,6 @@ public: | ||||
| 					Packets[i].from_rank,Packets[i].do_recv, | ||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||
|     } | ||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||
|   } | ||||
|  | ||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||
| @@ -452,6 +450,9 @@ public: | ||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); | ||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||
|     else DslashLogFull(); | ||||
|     acceleratorCopySynchronise(); | ||||
|     // Everyone agrees we are all done | ||||
|     _grid->StencilBarrier();  | ||||
|   } | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   // Blocking send and receive. Either sequential or parallel. | ||||
| @@ -529,7 +530,6 @@ public: | ||||
|   { | ||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||
|  | ||||
|     // conformable(source.Grid(),_grid); | ||||
|     assert(source.Grid()==_grid); | ||||
|  | ||||
|     u_comm_offset=0; | ||||
| @@ -655,8 +655,8 @@ public: | ||||
|     CommsMerge(decompress,Mergers,Decompressions); | ||||
|   } | ||||
|   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { | ||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||
|     CommsMerge(decompress,MergersSHM,DecompressionsSHM); | ||||
|     assert(MergersSHM.size()==0); | ||||
|     assert(DecompressionsSHM.size()==0); | ||||
|   } | ||||
|  | ||||
|   template<class decompressor> | ||||
| @@ -665,9 +665,11 @@ public: | ||||
|     for(int i=0;i<mm.size();i++){ | ||||
|       decompressor::MergeFace(decompress,mm[i]); | ||||
|     } | ||||
|     if ( mm.size() )    acceleratorFenceComputeStream(); | ||||
|     for(int i=0;i<dd.size();i++){ | ||||
|       decompressor::DecompressFace(decompress,dd[i]); | ||||
|     } | ||||
|     if ( dd.size() )    acceleratorFenceComputeStream(); | ||||
|   } | ||||
|   //////////////////////////////////////// | ||||
|   // Set up routines | ||||
|   | ||||
| @@ -451,7 +451,7 @@ int main(int argc, char **argv) { | ||||
|    | ||||
| #define MIXED_PRECISION | ||||
| #ifdef MIXED_PRECISION | ||||
|   std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys; | ||||
|   std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys; | ||||
| #else | ||||
|   std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; | ||||
| #endif | ||||
| @@ -526,15 +526,13 @@ int main(int argc, char **argv) { | ||||
|       Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG)); | ||||
|     } else { | ||||
| #ifdef MIXED_PRECISION | ||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>( | ||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>( | ||||
| 			   *Numerators[h],*Denominators[h], | ||||
| 			   *NumeratorsF[h],*DenominatorsF[h], | ||||
| 			   *Numerators[h],*Denominators[h], | ||||
| 			   OFRp, SP_iters) ); | ||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>( | ||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>( | ||||
| 			   *Numerators[h],*Denominators[h], | ||||
| 			   *NumeratorsF[h],*DenominatorsF[h], | ||||
| 			   *Numerators[h],*Denominators[h], | ||||
| 			   OFRp, SP_iters) ); | ||||
| #else | ||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); | ||||
|   | ||||
| @@ -164,11 +164,6 @@ int main(int argc, char **argv) { | ||||
|   typedef MobiusEOFAFermionF FermionEOFAActionF; | ||||
|   typedef typename FermionActionF::FermionField FermionFieldF; | ||||
|  | ||||
|   typedef WilsonImplD2 FermionImplPolicyD2; | ||||
|   typedef MobiusFermionD2 FermionActionD2; | ||||
|   typedef MobiusEOFAFermionD2 FermionEOFAActionD2; | ||||
|   typedef typename FermionActionD2::FermionField FermionFieldD2; | ||||
|  | ||||
|   typedef Grid::XmlReader       Serialiser; | ||||
|  | ||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: | ||||
| @@ -272,7 +267,6 @@ int main(int argc, char **argv) { | ||||
|   // temporarily need a gauge field | ||||
|   LatticeGaugeFieldD  U(GridPtr); U=Zero(); | ||||
|   LatticeGaugeFieldF  UF(GridPtrF); UF=Zero(); | ||||
|   LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero(); | ||||
|  | ||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file | ||||
| @@ -394,15 +388,13 @@ int main(int argc, char **argv) { | ||||
|   std::vector<FermionAction *> Denominators; | ||||
|   std::vector<FermionActionF *> NumeratorsF; | ||||
|   std::vector<FermionActionF *> DenominatorsF; | ||||
|   std::vector<FermionActionD2 *> NumeratorsD2; | ||||
|   std::vector<FermionActionD2 *> DenominatorsD2; | ||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; | ||||
|   std::vector<MxPCG *> ActionMPCG; | ||||
|   std::vector<MxPCG *> MPCG; | ||||
|    | ||||
| #define MIXED_PRECISION | ||||
| #ifdef MIXED_PRECISION | ||||
|   std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys; | ||||
|   std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys; | ||||
| #else | ||||
|   std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; | ||||
| #endif | ||||
|   | ||||
benchmarks/Benchmark_dwf_fp32_paranoid.cc (new file, 387 lines)
| @@ -0,0 +1,387 @@ | ||||
|  /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid | ||||
|     Source file: ./benchmarks/Benchmark_dwf.cc | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #ifdef GRID_CUDA | ||||
| #define CUDA_PROFILE | ||||
| #endif | ||||
|  | ||||
| #ifdef CUDA_PROFILE | ||||
| #include <cuda_profiler_api.h> | ||||
| #endif | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class d> | ||||
| struct scal { | ||||
|   d internal; | ||||
| }; | ||||
|  | ||||
|   Gamma::Algebra Gmu [] = { | ||||
|     Gamma::Algebra::GammaX, | ||||
|     Gamma::Algebra::GammaY, | ||||
|     Gamma::Algebra::GammaZ, | ||||
|     Gamma::Algebra::GammaT | ||||
|   }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|  | ||||
|   int threads = GridThread::GetThreads(); | ||||
|  | ||||
|   Coordinate latt4 = GridDefaultLatt(); | ||||
|   int Ls=16; | ||||
|   for(int i=0;i<argc;i++) | ||||
|     if(std::string(argv[i]) == "-Ls"){ | ||||
|       std::stringstream ss(argv[i+1]); ss >> Ls; | ||||
|     } | ||||
|  | ||||
|   GridLogLayout(); | ||||
|  | ||||
|   long unsigned int single_site_flops = 8*Nc*(7+16*Nc); | ||||
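|   // Dhop flop count per (5D) lattice site: 8 hop directions, each an SU(Nc) | ||||
|   // matrix applied to a spin-projected half-spinor plus the projection and | ||||
|   // accumulation adds; for Nc=3 this is 8*3*(7+16*3) = 1320 flops. | ||||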
|  | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
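|   // UGrid/UrbGrid: full and red-black (even-odd) 4D gauge grids. | ||||
|   // FGrid/FrbGrid: the matching 5D fermion grids with Ls slices in the s | ||||
|   // direction, which sits innermost and takes no part in the parity. | ||||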
|  | ||||
|   std::cout << GridLogMessage << "Making s innermost grids"<<std::endl; | ||||
|   GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||
|   GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||
|  | ||||
|   LatticeFermionF src   (FGrid); random(RNG5,src); | ||||
|   LatticeFermionF src1   (FGrid); random(RNG5,src1); | ||||
| #if 0 | ||||
|   src = Zero(); | ||||
|   { | ||||
|     Coordinate origin({0,0,0,latt4[2]-1,0}); | ||||
|     SpinColourVectorF tmp; | ||||
|     tmp=Zero(); | ||||
|     tmp()(0)(0)=Complex(-2.0,0.0); | ||||
|     std::cout << " source site 0 " << tmp<<std::endl; | ||||
|     pokeSite(tmp,src,origin); | ||||
|   } | ||||
| #else | ||||
|   RealD N2 = 1.0/::sqrt(norm2(src)); | ||||
|   src = src*N2; | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   LatticeFermionF result(FGrid); result=Zero(); | ||||
|   LatticeFermionF    ref(FGrid);    ref=Zero(); | ||||
|   LatticeFermionF    tmp(FGrid); | ||||
|   LatticeFermionF    err(FGrid); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeFieldF Umu(UGrid); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
| #if 0 | ||||
|   Umu=1.0; | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     LatticeColourMatrixF ttmp(UGrid); | ||||
|     ttmp = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|     //    if (mu !=2 ) ttmp = 0; | ||||
|     //    ttmp = ttmp* pow(10.0,mu); | ||||
|     PokeIndex<LorentzIndex>(Umu,ttmp,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Forced to diagonal " << std::endl; | ||||
| #endif | ||||
|  | ||||
|   //////////////////////////////////// | ||||
|   // Naive wilson implementation | ||||
|   //////////////////////////////////// | ||||
|   // replicate across fifth dimension | ||||
|   //  LatticeGaugeFieldF Umu5d(FGrid); | ||||
|   std::vector<LatticeColourMatrixF> U(4,UGrid); | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; | ||||
|  | ||||
|   if (1) | ||||
|   { | ||||
|     ref = Zero(); | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|       tmp = Cshift(src,mu+1,1); | ||||
|       { | ||||
| 	autoView( tmp_v  , tmp  , CpuWrite); | ||||
| 	autoView( U_v  , U[mu]  , CpuRead); | ||||
| 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||
| 	  for(int s=0;s<Ls;s++){ | ||||
| 	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s]; | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|       ref=ref + tmp - Gamma(Gmu[mu])*tmp; | ||||
|  | ||||
|       { | ||||
| 	autoView( tmp_v  , tmp  , CpuWrite); | ||||
| 	autoView( U_v  , U[mu]  , CpuRead); | ||||
| 	autoView( src_v, src    , CpuRead); | ||||
| 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||
| 	  for(int s=0;s<Ls;s++){ | ||||
| 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|       tmp =Cshift(tmp,mu+1,-1); | ||||
|       ref=ref + tmp + Gamma(Gmu[mu])*tmp; | ||||
|     } | ||||
|     ref = -0.5*ref; | ||||
|   } | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD M5  =1.8; | ||||
|  | ||||
|   RealD NP = UGrid->_Nprocessors; | ||||
|   RealD NN = UGrid->NodeCount(); | ||||
|  | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl; | ||||
|   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|  | ||||
|   DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
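|   // Dw is the 5D domain wall operator built on the 4D gauge field Umu; | ||||
|   // M5=1.8 is the domain wall height and mass=0.1 the bare quark mass. | ||||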
|   int ncall =100; | ||||
|  | ||||
|   if (1) { | ||||
|     FGrid->Barrier(); | ||||
|     Dw.Dhop(src,result,0); | ||||
|     std::cout<<GridLogMessage<<"Called warmup"<<std::endl; | ||||
|     double t0=usecond(); | ||||
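|     // "Paranoid" timing loop: every iteration recomputes Dhop on two sources | ||||
|     // and validates against the Cshift reference, so an intermittent numerical | ||||
|     // or comms fault aborts the run instead of silently skewing the timing. | ||||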
|     for(int i=0;i<ncall;i++){ | ||||
|       Dw.Dhop(src1,result,0); | ||||
|       Dw.Dhop(src,result,0); | ||||
|       err = ref-result; | ||||
|       std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|       assert (norm2(err)< 1.0e-4 ); | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|  | ||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|     double flops=single_site_flops*volume*ncall; | ||||
|  | ||||
|     auto nsimd = vComplexF::Nsimd(); | ||||
|     auto simdwidth = sizeof(vComplexF); | ||||
|  | ||||
|     // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors | ||||
|     double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||
|  | ||||
|     // mem: Nd Wilson * Ls, Nd gauge, Nc colors | ||||
|     double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||
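|     // simdwidth/nsimd is the size of one complex scalar (8 B here, single | ||||
|     // precision). data_rf assumes no caching: per 5D site, (2*Nd+1) = 9 | ||||
|     // spinors of 4*Nc complex numbers (the code writes Nd for the 4 spin | ||||
|     // components) plus 2*Nd gauge links of Nc*Nc each. data_mem instead | ||||
|     // counts the gauge field once per 4D site (volume/Ls), i.e. perfect | ||||
|     // reuse of the links across the Ls slices. | ||||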
|  | ||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl; | ||||
|     err = ref-result; | ||||
|     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|     //exit(0); | ||||
|  | ||||
|     if(( norm2(err)>1.0e-4) ) { | ||||
|  | ||||
|       /* | ||||
|       std::cout << "RESULT\n " << result<<std::endl; | ||||
|       std::cout << "REF   \n " << ref   <<std::endl; | ||||
|       std::cout << "ERR   \n " << err   <<std::endl; | ||||
|       */ | ||||
|       std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; | ||||
|       FGrid->Barrier(); | ||||
|       exit(-1); | ||||
|     } | ||||
|     assert (norm2(err)< 1.0e-4 ); | ||||
|   } | ||||
|  | ||||
|   if (1) | ||||
|   { // Naive wilson dag implementation | ||||
|     ref = Zero(); | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|       //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x | ||||
|       tmp = Cshift(src,mu+1,1); | ||||
|       { | ||||
| 	autoView( ref_v, ref, CpuWrite); | ||||
| 	autoView( tmp_v, tmp, CpuRead); | ||||
| 	autoView( U_v  , U[mu]  , CpuRead); | ||||
| 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||
| 	  for(int s=0;s<Ls;s++){ | ||||
| 	    int i=s+Ls*ss; | ||||
| 	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|        | ||||
|       { | ||||
| 	autoView( tmp_v  , tmp  , CpuWrite); | ||||
| 	autoView( U_v  , U[mu]  , CpuRead); | ||||
| 	autoView( src_v, src    , CpuRead); | ||||
| 	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ | ||||
| 	  for(int s=0;s<Ls;s++){ | ||||
| 	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s]; | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|       //      tmp =adj(U[mu])*src; | ||||
|       tmp =Cshift(tmp,mu+1,-1); | ||||
|       { | ||||
| 	autoView( ref_v, ref, CpuWrite); | ||||
| 	autoView( tmp_v, tmp, CpuRead); | ||||
| 	for(int i=0;i<ref_v.size();i++){ | ||||
| 	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|     ref = -0.5*ref; | ||||
|   } | ||||
|   //  dump=1; | ||||
|   Dw.Dhop(src,result,1); | ||||
|   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; | ||||
|   std::cout<<GridLogMessage << "Called DwDag"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl; | ||||
|   err = ref-result; | ||||
|   std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl; | ||||
|   if((norm2(err)>1.0e-4)){ | ||||
| /* | ||||
| 	std::cout<< "DAG RESULT\n "  <<ref     << std::endl; | ||||
| 	std::cout<< "DAG sRESULT\n " <<result  << std::endl; | ||||
| 	std::cout<< "DAG ERR   \n "  << err    <<std::endl; | ||||
| */ | ||||
|   } | ||||
|   LatticeFermionF src_e (FrbGrid); | ||||
|   LatticeFermionF src_o (FrbGrid); | ||||
|   LatticeFermionF r_e   (FrbGrid); | ||||
|   LatticeFermionF r_o   (FrbGrid); | ||||
|   LatticeFermionF r_eo  (FGrid); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; | ||||
|   pickCheckerboard(Even,src_e,src); | ||||
|   pickCheckerboard(Odd,src_o,src); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; | ||||
|  | ||||
|  | ||||
|   // S-direction is INNERMOST and takes no part in the parity. | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||
|   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   { | ||||
|     FGrid->Barrier(); | ||||
|     Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
| #ifdef CUDA_PROFILE | ||||
|       if(i==10) cudaProfilerStart(); | ||||
| #endif | ||||
|       Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
| #ifdef CUDA_PROFILE | ||||
|       if(i==20) cudaProfilerStop(); | ||||
| #endif | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|  | ||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|     double flops=(single_site_flops*volume*ncall)/2.0; | ||||
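|     // DhopEO acts on a single checkerboard, i.e. half the sites, hence the /2. | ||||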
|  | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl; | ||||
|   } | ||||
|   Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
|   Dw.DhopOE(src_e,r_o,DaggerNo); | ||||
|   Dw.Dhop  (src  ,result,DaggerNo); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl; | ||||
|  | ||||
|   setCheckerboard(r_eo,r_o); | ||||
|   setCheckerboard(r_eo,r_e); | ||||
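|   // The hopping term couples only opposite parities, so the full Dhop result | ||||
|   // must equal the two half-checkerboard applications reassembled in r_eo. | ||||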
|  | ||||
|   err = r_eo-result; | ||||
|   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|   if((norm2(err)>1.0e-4)){ | ||||
|     /* | ||||
| 	std::cout<< "Deo RESULT\n " <<r_eo << std::endl; | ||||
| 	std::cout<< "Deo REF\n " <<result  << std::endl; | ||||
| 	std::cout<< "Deo ERR   \n " << err <<std::endl; | ||||
|     */ | ||||
|   } | ||||
|  | ||||
|   pickCheckerboard(Even,src_e,err); | ||||
|   pickCheckerboard(Odd,src_o,err); | ||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||
|  | ||||
|   assert(norm2(src_e)<1.0e-4); | ||||
|   assert(norm2(src_o)<1.0e-4); | ||||
|   Grid_finalize(); | ||||
|   exit(0); | ||||
| } | ||||
| @@ -1,7 +1,7 @@ | ||||
| CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` | ||||
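# c-lime is located via spack above. --enable-unified=no selects explicit | ||||
# device allocations rather than unified (managed) GPU memory; | ||||
# --enable-shm=nvlink uses peer device memory for intra-node communication. | ||||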
| ../../configure --enable-comms=mpi-auto \ | ||||
| --with-lime=$CLIME \ | ||||
- --enable-unified=yes \ | ||||
+ --enable-unified=no \ | ||||
| --enable-shm=nvlink \ | ||||
| --enable-tracing=timer \ | ||||
| --enable-accelerator=hip \ | ||||
|   | ||||
| @@ -5,8 +5,8 @@ module load emacs | ||||
| #module load gperftools | ||||
| module load PrgEnv-gnu | ||||
| module load rocm/5.3.0 | ||||
- module load cray-mpich/8.1.16 | ||||
- #module load cray-mpich/8.1.17 | ||||
+ #module load cray-mpich/8.1.16 | ||||
+ module load cray-mpich/8.1.17 | ||||
| module load gmp | ||||
| module load cray-fftw | ||||
| module load craype-accel-amd-gfx90a | ||||
|   | ||||
| @@ -4,7 +4,7 @@ | ||||
| #SBATCH -p QZ1J-ICX-PVC | ||||
| ##SBATCH -p QZ1J-SPR-PVC-2C | ||||
|  | ||||
- source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
+ #source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
|  | ||||
| export NT=8 | ||||
|  | ||||
|   | ||||
| @@ -4,7 +4,7 @@ | ||||
|  | ||||
| #SBATCH -p QZ1J-ICX-PVC | ||||
|  | ||||
- source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
+ #source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
|  | ||||
| export NT=16 | ||||
|  | ||||
| @@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero | ||||
| export I_MPI_OFFLOAD_CELL=tile | ||||
| export EnableImplicitScaling=0 | ||||
| export EnableWalkerPartition=0 | ||||
- export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 | ||||
- export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||
+ #export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 | ||||
+ #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 | ||||
|  | ||||
| #mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log | ||||
| for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | ||||
| do | ||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i | ||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i  | ||||
| done | ||||
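# Each decomposition above runs 21 times to expose run-to-run variability; | ||||
# --mpi a.b.c.d lays ranks out over x.y.z.t, so the two cases split the | ||||
# lattice across the two ranks in t and in x respectively. | ||||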
|  | ||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 | ||||
|  | ||||
|   | ||||
| @@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID | ||||
echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK | ||||
|  | ||||
|  | ||||
| #if [ $MPI_LOCALRANKID = "0" ]  | ||||
| #then | ||||
| #  ~psteinbr/build_pti/ze_tracer -c $@ | ||||
| #  onetrace --chrome-kernel-timeline $@ | ||||
| #else | ||||
|   $@ | ||||
| #fi | ||||
|  | ||||
|   | ||||