mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
This commit is contained in:
		@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 | 
			
		||||
 
 | 
			
		||||
@@ -191,7 +191,7 @@ public:
 | 
			
		||||
	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
 | 
			
		||||
	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
 | 
			
		||||
	std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
 | 
			
		||||
 | 
			
		||||
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										213
									
								
								Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,213 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 | 
			
		||||
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
//Mixed precision restarted defect correction CG
 | 
			
		||||
template<class FieldD,class FieldF, 
 | 
			
		||||
  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 | 
			
		||||
  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 | 
			
		||||
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
 | 
			
		||||
public:
 | 
			
		||||
  using LinearFunction<FieldD>::operator();
 | 
			
		||||
  RealD   Tolerance;
 | 
			
		||||
  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 | 
			
		||||
  Integer MaxInnerIterations;
 | 
			
		||||
  Integer MaxOuterIterations;
 | 
			
		||||
  Integer MaxPatchupIterations;
 | 
			
		||||
  GridBase* SinglePrecGrid; //Grid for single-precision fields
 | 
			
		||||
  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
 | 
			
		||||
  LinearOperatorBase<FieldF> &Linop_f;
 | 
			
		||||
  LinearOperatorBase<FieldD> &Linop_d;
 | 
			
		||||
 | 
			
		||||
  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
 | 
			
		||||
  LinearFunction<FieldF> *guesser;
 | 
			
		||||
  bool updateResidual;
 | 
			
		||||
  
 | 
			
		||||
  MixedPrecisionConjugateGradientBatched(RealD tol, 
 | 
			
		||||
          Integer maxinnerit, 
 | 
			
		||||
          Integer maxouterit, 
 | 
			
		||||
          Integer maxpatchit,
 | 
			
		||||
          GridBase* _sp_grid, 
 | 
			
		||||
          LinearOperatorBase<FieldF> &_Linop_f, 
 | 
			
		||||
          LinearOperatorBase<FieldD> &_Linop_d,
 | 
			
		||||
          bool _updateResidual=true) :
 | 
			
		||||
    Linop_f(_Linop_f), Linop_d(_Linop_d),
 | 
			
		||||
    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
 | 
			
		||||
    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
 | 
			
		||||
 | 
			
		||||
  void useGuesser(LinearFunction<FieldF> &g){
 | 
			
		||||
    guesser = &g;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  void operator() (const FieldD &src_d_in, FieldD &sol_d){
 | 
			
		||||
    std::vector<FieldD> srcs_d_in{src_d_in};
 | 
			
		||||
    std::vector<FieldD> sols_d{sol_d};
 | 
			
		||||
 | 
			
		||||
    (*this)(srcs_d_in,sols_d);
 | 
			
		||||
 | 
			
		||||
    sol_d = sols_d[0];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
 | 
			
		||||
    assert(src_d_in.size() == sol_d.size());
 | 
			
		||||
    int NBatch = src_d_in.size();
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
 | 
			
		||||
 | 
			
		||||
    Integer TotalOuterIterations = 0; //Number of restarts
 | 
			
		||||
    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
 | 
			
		||||
    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
 | 
			
		||||
  
 | 
			
		||||
    GridStopWatch TotalTimer;
 | 
			
		||||
    TotalTimer.Start();
 | 
			
		||||
 | 
			
		||||
    GridStopWatch InnerCGtimer;
 | 
			
		||||
    GridStopWatch PrecChangeTimer;
 | 
			
		||||
    
 | 
			
		||||
    int cb = src_d_in[0].Checkerboard();
 | 
			
		||||
    
 | 
			
		||||
    std::vector<RealD> src_norm;
 | 
			
		||||
    std::vector<RealD> norm;
 | 
			
		||||
    std::vector<RealD> stop;
 | 
			
		||||
    
 | 
			
		||||
    GridBase* DoublePrecGrid = src_d_in[0].Grid();
 | 
			
		||||
    FieldD tmp_d(DoublePrecGrid);
 | 
			
		||||
    tmp_d.Checkerboard() = cb;
 | 
			
		||||
    
 | 
			
		||||
    FieldD tmp2_d(DoublePrecGrid);
 | 
			
		||||
    tmp2_d.Checkerboard() = cb;
 | 
			
		||||
 | 
			
		||||
    std::vector<FieldD> src_d;
 | 
			
		||||
    std::vector<FieldF> src_f;
 | 
			
		||||
    std::vector<FieldF> sol_f;
 | 
			
		||||
 | 
			
		||||
    for (int i=0; i<NBatch; i++) {
 | 
			
		||||
      sol_d[i].Checkerboard() = cb;
 | 
			
		||||
 | 
			
		||||
      src_norm.push_back(norm2(src_d_in[i]));
 | 
			
		||||
      norm.push_back(0.);
 | 
			
		||||
      stop.push_back(src_norm[i] * Tolerance*Tolerance);
 | 
			
		||||
 | 
			
		||||
      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
 | 
			
		||||
 | 
			
		||||
      src_f.push_back(SinglePrecGrid);
 | 
			
		||||
      src_f[i].Checkerboard() = cb;
 | 
			
		||||
 | 
			
		||||
      sol_f.push_back(SinglePrecGrid);
 | 
			
		||||
      sol_f[i].Checkerboard() = cb;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    RealD inner_tol = InnerTolerance;
 | 
			
		||||
    
 | 
			
		||||
    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
 | 
			
		||||
    CG_f.ErrorOnNoConverge = false;
 | 
			
		||||
    
 | 
			
		||||
    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
 | 
			
		||||
      
 | 
			
		||||
    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
 | 
			
		||||
      std::cout << GridLogMessage << std::endl;
 | 
			
		||||
      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
 | 
			
		||||
      
 | 
			
		||||
      bool allConverged = true;
 | 
			
		||||
      
 | 
			
		||||
      for (int i=0; i<NBatch; i++) {
 | 
			
		||||
        //Compute double precision rsd and also new RHS vector.
 | 
			
		||||
        Linop_d.HermOp(sol_d[i], tmp_d);
 | 
			
		||||
        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
 | 
			
		||||
        
 | 
			
		||||
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
 | 
			
		||||
 | 
			
		||||
        PrecChangeTimer.Start();
 | 
			
		||||
        precisionChange(src_f[i], src_d[i]);
 | 
			
		||||
        PrecChangeTimer.Stop();
 | 
			
		||||
        
 | 
			
		||||
        sol_f[i] = Zero();
 | 
			
		||||
      
 | 
			
		||||
        if(norm[i] > OuterLoopNormMult * stop[i]) {
 | 
			
		||||
          allConverged = false;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      if (allConverged) break;
 | 
			
		||||
 | 
			
		||||
      if (updateResidual) {
 | 
			
		||||
        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
 | 
			
		||||
        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
 | 
			
		||||
        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
			
		||||
        CG_f.Tolerance = inner_tol;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      //Optionally improve inner solver guess (eg using known eigenvectors)
 | 
			
		||||
      if(guesser != NULL) {
 | 
			
		||||
        (*guesser)(src_f, sol_f);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for (int i=0; i<NBatch; i++) {
 | 
			
		||||
        //Inner CG
 | 
			
		||||
        InnerCGtimer.Start();
 | 
			
		||||
        CG_f(Linop_f, src_f[i], sol_f[i]);
 | 
			
		||||
        InnerCGtimer.Stop();
 | 
			
		||||
        TotalInnerIterations[i] += CG_f.IterationsToComplete;
 | 
			
		||||
        
 | 
			
		||||
        //Convert sol back to double and add to double prec solution
 | 
			
		||||
        PrecChangeTimer.Start();
 | 
			
		||||
        precisionChange(tmp_d, sol_f[i]);
 | 
			
		||||
        PrecChangeTimer.Stop();
 | 
			
		||||
        
 | 
			
		||||
        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    //Final trial CG
 | 
			
		||||
    std::cout << GridLogMessage << std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    for (int i=0; i<NBatch; i++) {
 | 
			
		||||
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
 | 
			
		||||
      CG_d(Linop_d, src_d_in[i], sol_d[i]);
 | 
			
		||||
      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    TotalTimer.Stop();
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << std::endl;
 | 
			
		||||
    for (int i=0; i<NBatch; i++) {
 | 
			
		||||
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    std::cout << GridLogMessage << std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
 | 
			
		||||
    
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
/*Allocation types, saying which pointer cache should be used*/
 | 
			
		||||
#define Cpu      (0)
 | 
			
		||||
#define CpuSmall (1)
 | 
			
		||||
#define Acc      (2)
 | 
			
		||||
#define AccSmall (3)
 | 
			
		||||
#define Shared   (4)
 | 
			
		||||
#define SharedSmall (5)
 | 
			
		||||
#define CpuHuge  (1)
 | 
			
		||||
#define CpuSmall (2)
 | 
			
		||||
#define Acc      (3)
 | 
			
		||||
#define AccHuge  (4)
 | 
			
		||||
#define AccSmall (5)
 | 
			
		||||
#define Shared   (6)
 | 
			
		||||
#define SharedHuge  (7)
 | 
			
		||||
#define SharedSmall (8)
 | 
			
		||||
#undef GRID_MM_VERBOSE 
 | 
			
		||||
uint64_t total_shared;
 | 
			
		||||
uint64_t total_device;
 | 
			
		||||
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
 | 
			
		||||
uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Data tables for recently freed pooiniter caches
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 | 
			
		||||
int MemoryManager::Victim[MemoryManager::NallocType];
 | 
			
		||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
 | 
			
		||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 | 
			
		||||
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 | 
			
		||||
//////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Actual allocation and deallocation utils
 | 
			
		||||
@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_HUGE");
 | 
			
		||||
  if ( str ) {
 | 
			
		||||
    Nc = atoi(str);
 | 
			
		||||
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
 | 
			
		||||
      Ncache[CpuHuge]=Nc;
 | 
			
		||||
      Ncache[AccHuge]=Nc;
 | 
			
		||||
      Ncache[SharedHuge]=Nc;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  str= getenv("GRID_ALLOC_NCACHE_SMALL");
 | 
			
		||||
  if ( str ) {
 | 
			
		||||
    Nc = atoi(str);
 | 
			
		||||
@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
 | 
			
		||||
  
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 | 
			
		||||
#ifdef ALLOCATION_CACHE
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
#ifdef GRID_UVM
 | 
			
		||||
@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 | 
			
		||||
void *MemoryManager::Insert(void *ptr,size_t bytes,int type) 
 | 
			
		||||
{
 | 
			
		||||
#ifdef ALLOCATION_CACHE
 | 
			
		||||
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
			
		||||
  int cache = type + small;
 | 
			
		||||
  int cache;
 | 
			
		||||
  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
 | 
			
		||||
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
 | 
			
		||||
  else                                     cache = type;
 | 
			
		||||
 | 
			
		||||
  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 | 
			
		||||
#else
 | 
			
		||||
  return ptr;
 | 
			
		||||
@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 | 
			
		||||
 | 
			
		||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 | 
			
		||||
{
 | 
			
		||||
  assert(ncache>0);
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
  if (ncache == 0) return ptr;
 | 
			
		||||
 | 
			
		||||
  void * ret = NULL;
 | 
			
		||||
  int v = -1;
 | 
			
		||||
 | 
			
		||||
@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 | 
			
		||||
void *MemoryManager::Lookup(size_t bytes,int type)
 | 
			
		||||
{
 | 
			
		||||
#ifdef ALLOCATION_CACHE
 | 
			
		||||
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 | 
			
		||||
  int cache = type+small;
 | 
			
		||||
  int cache;
 | 
			
		||||
  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
 | 
			
		||||
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
 | 
			
		||||
  else                                     cache = type;
 | 
			
		||||
 | 
			
		||||
  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 | 
			
		||||
#else
 | 
			
		||||
  return NULL;
 | 
			
		||||
@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 | 
			
		||||
 | 
			
		||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 | 
			
		||||
{
 | 
			
		||||
  assert(ncache>0);
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  assert(omp_in_parallel()==0);
 | 
			
		||||
#endif 
 | 
			
		||||
 
 | 
			
		||||
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
// Move control to configure.ac and Config.h?
 | 
			
		||||
 | 
			
		||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
 | 
			
		||||
#define GRID_ALLOC_HUGE_LIMIT  (2147483648)
 | 
			
		||||
 | 
			
		||||
#define STRINGIFY(x) #x
 | 
			
		||||
#define TOSTRING(x) STRINGIFY(x)
 | 
			
		||||
@@ -70,6 +71,21 @@ enum ViewMode {
 | 
			
		||||
  CpuWriteDiscard = 0x10 // same for now
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct MemoryStatus {
 | 
			
		||||
  uint64_t     DeviceBytes;
 | 
			
		||||
  uint64_t     DeviceLRUBytes;
 | 
			
		||||
  uint64_t     DeviceMaxBytes;
 | 
			
		||||
  uint64_t     HostToDeviceBytes;
 | 
			
		||||
  uint64_t     DeviceToHostBytes;
 | 
			
		||||
  uint64_t     HostToDeviceXfer;
 | 
			
		||||
  uint64_t     DeviceToHostXfer;
 | 
			
		||||
  uint64_t     DeviceEvictions;
 | 
			
		||||
  uint64_t     DeviceDestroy;
 | 
			
		||||
  uint64_t     DeviceAllocCacheBytes;
 | 
			
		||||
  uint64_t     HostAllocCacheBytes;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MemoryManager {
 | 
			
		||||
private:
 | 
			
		||||
 | 
			
		||||
@@ -83,7 +99,7 @@ private:
 | 
			
		||||
  } AllocationCacheEntry;
 | 
			
		||||
 | 
			
		||||
  static const int NallocCacheMax=128; 
 | 
			
		||||
  static const int NallocType=6;
 | 
			
		||||
  static const int NallocType=9;
 | 
			
		||||
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 | 
			
		||||
  static int Victim[NallocType];
 | 
			
		||||
  static int Ncache[NallocType];
 | 
			
		||||
@@ -122,6 +138,25 @@ private:
 | 
			
		||||
  static uint64_t     DeviceEvictions;
 | 
			
		||||
  static uint64_t     DeviceDestroy;
 | 
			
		||||
  
 | 
			
		||||
  static uint64_t     DeviceCacheBytes();
 | 
			
		||||
  static uint64_t     HostCacheBytes();
 | 
			
		||||
 | 
			
		||||
  static MemoryStatus GetFootprint(void) {
 | 
			
		||||
    MemoryStatus stat;
 | 
			
		||||
    stat.DeviceBytes       = DeviceBytes;
 | 
			
		||||
    stat.DeviceLRUBytes    = DeviceLRUBytes;
 | 
			
		||||
    stat.DeviceMaxBytes    = DeviceMaxBytes;
 | 
			
		||||
    stat.HostToDeviceBytes = HostToDeviceBytes;
 | 
			
		||||
    stat.DeviceToHostBytes = DeviceToHostBytes;
 | 
			
		||||
    stat.HostToDeviceXfer  = HostToDeviceXfer;
 | 
			
		||||
    stat.DeviceToHostXfer  = DeviceToHostXfer;
 | 
			
		||||
    stat.DeviceEvictions   = DeviceEvictions;
 | 
			
		||||
    stat.DeviceDestroy     = DeviceDestroy;
 | 
			
		||||
    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
 | 
			
		||||
    stat.HostAllocCacheBytes   = HostCacheBytes();
 | 
			
		||||
    return stat;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
 private:
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 | 
			
		||||
{
 | 
			
		||||
  acceleratorCopySynchronise();
 | 
			
		||||
  StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
 | 
			
		||||
  int nreq=list.size();
 | 
			
		||||
 | 
			
		||||
  if (nreq==0) return;
 | 
			
		||||
 
 | 
			
		||||
@@ -37,10 +37,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 | 
			
		||||
#ifdef GRID_HIP
 | 
			
		||||
#include <hip/hip_runtime_api.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCl
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
#define GRID_SYCL_LEVEL_ZERO_IPC
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid); 
 | 
			
		||||
#define header "SharedMemoryMpi: "
 | 
			
		||||
/*Construct from an MPI communicator*/
 | 
			
		||||
 
 | 
			
		||||
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
 | 
			
		||||
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
 | 
			
		||||
 | 
			
		||||
template <typename T>
 | 
			
		||||
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
 | 
			
		||||
{
 | 
			
		||||
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
 | 
			
		||||
    if (idx >= e1*e2) return;
 | 
			
		||||
 | 
			
		||||
    int n, b, o;
 | 
			
		||||
 | 
			
		||||
    n = idx / e2;
 | 
			
		||||
    b = idx % e2;
 | 
			
		||||
    o = n*stride + b;
 | 
			
		||||
 | 
			
		||||
    vector[2*idx + 0] = lo + o;
 | 
			
		||||
    vector[2*idx + 1] = ro + o;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// local to node block strided copies
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
			
		||||
  int ent=0;
 | 
			
		||||
 | 
			
		||||
  if(cbmask == 0x3 ){
 | 
			
		||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 | 
			
		||||
    ent = e1*e2;
 | 
			
		||||
    dim3 blockSize(acceleratorThreads());
 | 
			
		||||
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
 | 
			
		||||
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
 | 
			
		||||
    accelerator_barrier();
 | 
			
		||||
#else
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
        int o =n*stride+b;
 | 
			
		||||
	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
  } else { 
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 | 
			
		||||
  int ent=0;
 | 
			
		||||
 | 
			
		||||
  if ( cbmask == 0x3 ) {
 | 
			
		||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
 | 
			
		||||
    ent = e1*e2;
 | 
			
		||||
    dim3 blockSize(acceleratorThreads());
 | 
			
		||||
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
 | 
			
		||||
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
 | 
			
		||||
    accelerator_barrier();
 | 
			
		||||
#else
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
      int o  =n*stride;
 | 
			
		||||
      Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 | 
			
		||||
    }}
 | 
			
		||||
#endif
 | 
			
		||||
  } else {
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
    for(int b=0;b<e2;b++){
 | 
			
		||||
 
 | 
			
		||||
@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  typename vobj::scalar_object ssum;
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  ssum= sum_gpu(&arg_v[0],osites);
 | 
			
		||||
  return sum_gpu(&arg_v[0],osites);
 | 
			
		||||
#else
 | 
			
		||||
  autoView(arg_v, arg, CpuRead);
 | 
			
		||||
  auto ssum= sum_cpu(&arg_v[0],osites);
 | 
			
		||||
  return sum_cpu(&arg_v[0],osites);
 | 
			
		||||
#endif  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
  auto ssum = rankSum(arg);
 | 
			
		||||
  arg.Grid()->GlobalSum(ssum);
 | 
			
		||||
  return ssum;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 | 
			
		||||
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_gpu_large(&arg_v[0],osites);
 | 
			
		||||
  return sum_gpu_large(&arg_v[0],osites);
 | 
			
		||||
#else
 | 
			
		||||
  autoView(arg_v, arg, CpuRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_cpu(&arg_v[0],osites);
 | 
			
		||||
  return sum_cpu(&arg_v[0],osites);
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
  auto ssum = rankSumLarge(arg);
 | 
			
		||||
  arg.Grid()->GlobalSum(ssum);
 | 
			
		||||
  return ssum;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
 | 
			
		||||
  assert(ok);
 | 
			
		||||
 | 
			
		||||
  Integer smemSize = numThreads * sizeof(sobj);
 | 
			
		||||
  // UVM seems to be buggy under later CUDA drivers
 | 
			
		||||
  // This fails on A100 and driver 5.30.02 / CUDA 12.1
 | 
			
		||||
  // Fails with multiple NVCC versions back to 11.4,
 | 
			
		||||
  // which worked with earlier drivers.
 | 
			
		||||
  // Not sure which driver had first fail and this bears checking
 | 
			
		||||
  // Is awkward as must install multiple driver versions
 | 
			
		||||
  // Move out of UVM
 | 
			
		||||
  // Turns out I had messed up the synchronise after move to compute stream
 | 
			
		||||
  // as running this on the default stream fools the synchronise
 | 
			
		||||
#undef UVM_BLOCK_BUFFER  
 | 
			
		||||
#ifndef UVM_BLOCK_BUFFER  
 | 
			
		||||
  commVector<sobj> buffer(numBlocks);
 | 
			
		||||
  sobj *buffer_v = &buffer[0];
 | 
			
		||||
  sobj result;
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
 | 
			
		||||
  accelerator_barrier();
 | 
			
		||||
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 | 
			
		||||
#else
 | 
			
		||||
  Vector<sobj> buffer(numBlocks);
 | 
			
		||||
  sobj *buffer_v = &buffer[0];
 | 
			
		||||
  sobj result;
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
 | 
			
		||||
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
 | 
			
		||||
  accelerator_barrier();
 | 
			
		||||
  result = *buffer_v;
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -440,17 +440,8 @@ public:
 | 
			
		||||
	_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 | 
			
		||||
 | 
			
		||||
	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
#if 1
 | 
			
		||||
	assert(rank == _grid->ThisRank() );
 | 
			
		||||
#else
 | 
			
		||||
// 
 | 
			
		||||
	if (rank != _grid->ThisRank() ){
 | 
			
		||||
	std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
 | 
			
		||||
//	exit(-42);
 | 
			
		||||
//	assert(0);
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
	assert(rank == _grid->ThisRank() );
 | 
			
		||||
	
 | 
			
		||||
	int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
	_generators[l_idx] = master_engine;
 | 
			
		||||
 
 | 
			
		||||
@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
template<class vobj,class CComplex,int nbasis,class VLattice>
 | 
			
		||||
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
 | 
			
		||||
                               const std::vector<Lattice<vobj>> &fineData,
 | 
			
		||||
                               const VLattice &Basis)
 | 
			
		||||
{
 | 
			
		||||
  int NBatch = fineData.size();
 | 
			
		||||
  assert(coarseData.size() == NBatch);
 | 
			
		||||
 | 
			
		||||
  GridBase * fine  = fineData[0].Grid();
 | 
			
		||||
  GridBase * coarse= coarseData[0].Grid();
 | 
			
		||||
 | 
			
		||||
  Lattice<iScalar<CComplex>> ip(coarse);
 | 
			
		||||
  std::vector<Lattice<vobj>> fineDataCopy = fineData;
 | 
			
		||||
 | 
			
		||||
  autoView(ip_, ip, AcceleratorWrite);
 | 
			
		||||
  for(int v=0;v<nbasis;v++) {
 | 
			
		||||
    for (int k=0; k<NBatch; k++) {
 | 
			
		||||
      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
 | 
			
		||||
      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
 | 
			
		||||
      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
 | 
			
		||||
        convertType(coarseData_[sc](v),ip_[sc]);
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      // improve numerical stability of projection
 | 
			
		||||
      // |fine> = |fine> - <basis|fine> |basis>
 | 
			
		||||
      ip=-ip;
 | 
			
		||||
      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]); 
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj,class vobj2,class CComplex>
 | 
			
		||||
  inline void blockZAXPY(Lattice<vobj> &fineZ,
 | 
			
		||||
@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
template<class vobj,class CComplex,int nbasis,class VLattice>
 | 
			
		||||
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
 | 
			
		||||
                               std::vector<Lattice<vobj>> &fineData,
 | 
			
		||||
                               const VLattice &Basis)
 | 
			
		||||
{
 | 
			
		||||
  int NBatch = coarseData.size();
 | 
			
		||||
  assert(fineData.size() == NBatch);
 | 
			
		||||
 | 
			
		||||
  GridBase * fine   = fineData[0].Grid();
 | 
			
		||||
  GridBase * coarse = coarseData[0].Grid();
 | 
			
		||||
  for (int k=0; k<NBatch; k++)
 | 
			
		||||
    fineData[k]=Zero();
 | 
			
		||||
  for (int i=0;i<nbasis;i++) {
 | 
			
		||||
    for (int k=0; k<NBatch; k++) {
 | 
			
		||||
      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
 | 
			
		||||
      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 | 
			
		||||
// Simd layouts need not match since we use peek/poke Local
 | 
			
		||||
template<class vobj,class vvobj>
 | 
			
		||||
 
 | 
			
		||||
@@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
 | 
			
		||||
   if( interior && exterior ) {
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 | 
			
		||||
#ifdef SYCL_HACK     
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl);    return; }
 | 
			
		||||
#else
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 | 
			
		||||
#endif     
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
@@ -478,6 +474,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 | 
			
		||||
#endif
 | 
			
		||||
   } else if( exterior ) {
 | 
			
		||||
     acceleratorFenceComputeStream();
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
@@ -502,10 +499,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
     acceleratorFenceComputeStream();
 | 
			
		||||
   } else if( interior ) {
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt);    return;}
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
@@ -516,7 +512,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
 | 
			
		||||
#ifndef GRID_CUDA
 | 
			
		||||
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 | 
			
		||||
#endif
 | 
			
		||||
     acceleratorFenceComputeStream();
 | 
			
		||||
   }
 | 
			
		||||
   assert(0 && " Kernel optimisation case not covered ");
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../CayleyFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../ContinuedFractionFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../DomainWallEOFAFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../MobiusEOFAFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../PartialFractionFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonCloverFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonKernelsInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonTMFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
#define IMPLEMENTATION WilsonImplD2
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../CayleyFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../ContinuedFractionFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../DomainWallEOFAFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../MobiusEOFAFermionInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../PartialFractionFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonFermion5DInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../WilsonKernelsInstantiation.cc.master
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
#define IMPLEMENTATION ZWilsonImplD2
 | 
			
		||||
@@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
        // NumOp == V
 | 
			
		||||
        // DenOp == M
 | 
			
		||||
        //
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        FermionField etaOdd (NumOp.FermionRedBlackGrid());
 | 
			
		||||
        FermionField etaEven(NumOp.FermionRedBlackGrid());
 | 
			
		||||
        FermionField tmp    (NumOp.FermionRedBlackGrid());
 | 
			
		||||
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        pickCheckerboard(Even,etaEven,eta);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        pickCheckerboard(Odd,etaOdd,eta);
 | 
			
		||||
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        NumOp.ImportGauge(U);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        DenOp.ImportGauge(U);
 | 
			
		||||
	std::cout << " TwoFlavourRefresh:  Imported gauge "<<std::endl;
 | 
			
		||||
    AUDIT();
 | 
			
		||||
 | 
			
		||||
        SchurDifferentiableOperator<Impl> Mpc(DenOp);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        SchurDifferentiableOperator<Impl> Vpc(NumOp);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
 | 
			
		||||
	std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        // Odd det factors
 | 
			
		||||
        Mpc.MpcDag(etaOdd,PhiOdd);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
	std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
 | 
			
		||||
        tmp=Zero();
 | 
			
		||||
    AUDIT();
 | 
			
		||||
	std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
 | 
			
		||||
    AUDIT();
 | 
			
		||||
        HeatbathSolver(Vpc,PhiOdd,tmp);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
	std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
 | 
			
		||||
        Vpc.Mpc(tmp,PhiOdd);            
 | 
			
		||||
	std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -134,14 +134,12 @@ protected:
 | 
			
		||||
      double start_force = usecond();
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
 | 
			
		||||
      AUDIT();
 | 
			
		||||
      
 | 
			
		||||
      as[level].actions.at(a)->deriv_timer_start();
 | 
			
		||||
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
 | 
			
		||||
      as[level].actions.at(a)->deriv_timer_stop();
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
 | 
			
		||||
      AUDIT();
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
 | 
			
		||||
      auto name = as[level].actions.at(a)->action_name();
 | 
			
		||||
@@ -382,12 +380,12 @@ public:
 | 
			
		||||
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 | 
			
		||||
 | 
			
		||||
	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
	as[level].actions.at(actionID)->refresh_timer_start();
 | 
			
		||||
        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
 | 
			
		||||
	as[level].actions.at(actionID)->refresh_timer_stop();
 | 
			
		||||
	std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Refresh the higher representation actions
 | 
			
		||||
@@ -424,7 +422,7 @@ public:
 | 
			
		||||
    // Actions
 | 
			
		||||
    for (int level = 0; level < as.size(); ++level) {
 | 
			
		||||
      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
        // get gauge field from the SmearingPolicy and
 | 
			
		||||
        // based on the boolean is_smeared in actionID
 | 
			
		||||
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 | 
			
		||||
@@ -434,7 +432,7 @@ public:
 | 
			
		||||
   	        as[level].actions.at(actionID)->S_timer_stop();
 | 
			
		||||
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
 | 
			
		||||
        H += Hterm;
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      as[level].apply(S_hireps, Representations, level, H);
 | 
			
		||||
    }
 | 
			
		||||
@@ -447,9 +445,9 @@ public:
 | 
			
		||||
    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
 | 
			
		||||
      
 | 
			
		||||
      for (int a = 0; a < repr_set.size(); ++a) {
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
        RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
 | 
			
		||||
        H += Hterm;
 | 
			
		||||
 | 
			
		||||
@@ -474,10 +472,10 @@ public:
 | 
			
		||||
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
 | 
			
		||||
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
 | 
			
		||||
	        as[level].actions.at(actionID)->S_timer_start();
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
 | 
			
		||||
   	        as[level].actions.at(actionID)->S_timer_stop();
 | 
			
		||||
	AUDIT();
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
 | 
			
		||||
        H += Hterm;
 | 
			
		||||
      }
 | 
			
		||||
@@ -490,7 +488,6 @@ public:
 | 
			
		||||
  
 | 
			
		||||
  void integrate(Field& U) 
 | 
			
		||||
  {
 | 
			
		||||
    AUDIT();
 | 
			
		||||
    // reset the clocks
 | 
			
		||||
    t_U = 0;
 | 
			
		||||
    for (int level = 0; level < as.size(); ++level) {
 | 
			
		||||
@@ -508,10 +505,8 @@ public:
 | 
			
		||||
      assert(fabs(t_U - t_P[level]) < 1.0e-6);  // must be the same
 | 
			
		||||
      std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    AUDIT();
 | 
			
		||||
 | 
			
		||||
    FieldImplementation::Project(U);
 | 
			
		||||
    AUDIT();
 | 
			
		||||
 | 
			
		||||
    // and that we indeed got to the end of the trajectory
 | 
			
		||||
    assert(fabs(t_U - Params.trajL) < 1.0e-6);
 | 
			
		||||
 
 | 
			
		||||
@@ -434,7 +434,6 @@ public:
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
  {
 | 
			
		||||
    accelerator_barrier();
 | 
			
		||||
    for(int i=0;i<Packets.size();i++){
 | 
			
		||||
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 | 
			
		||||
					Packets[i].send_buf,
 | 
			
		||||
@@ -443,7 +442,6 @@ public:
 | 
			
		||||
					Packets[i].from_rank,Packets[i].do_recv,
 | 
			
		||||
					Packets[i].xbytes,Packets[i].rbytes,i);
 | 
			
		||||
    }
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
@@ -452,6 +450,9 @@ public:
 | 
			
		||||
    if   ( this->partialDirichlet ) DslashLogPartial();
 | 
			
		||||
    else if ( this->fullDirichlet ) DslashLogDirichlet();
 | 
			
		||||
    else DslashLogFull();
 | 
			
		||||
    acceleratorCopySynchronise();
 | 
			
		||||
    // Everyone agrees we are all done
 | 
			
		||||
    _grid->StencilBarrier(); 
 | 
			
		||||
  }
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Blocking send and receive. Either sequential or parallel.
 | 
			
		||||
@@ -529,7 +530,6 @@ public:
 | 
			
		||||
  {
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
 | 
			
		||||
    // conformable(source.Grid(),_grid);
 | 
			
		||||
    assert(source.Grid()==_grid);
 | 
			
		||||
 | 
			
		||||
    u_comm_offset=0;
 | 
			
		||||
@@ -655,8 +655,8 @@ public:
 | 
			
		||||
    CommsMerge(decompress,Mergers,Decompressions);
 | 
			
		||||
  }
 | 
			
		||||
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
 | 
			
		||||
    assert(MergersSHM.size()==0);
 | 
			
		||||
    assert(DecompressionsSHM.size()==0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class decompressor>
 | 
			
		||||
@@ -665,9 +665,11 @@ public:
 | 
			
		||||
    for(int i=0;i<mm.size();i++){
 | 
			
		||||
      decompressor::MergeFace(decompress,mm[i]);
 | 
			
		||||
    }
 | 
			
		||||
    if ( mm.size() )    acceleratorFenceComputeStream();
 | 
			
		||||
    for(int i=0;i<dd.size();i++){
 | 
			
		||||
      decompressor::DecompressFace(decompress,dd[i]);
 | 
			
		||||
    }
 | 
			
		||||
    if ( dd.size() )    acceleratorFenceComputeStream();
 | 
			
		||||
  }
 | 
			
		||||
  ////////////////////////////////////////
 | 
			
		||||
  // Set up routines
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										387
									
								
								benchmarks/Benchmark_dwf_fp32_paranoid.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										387
									
								
								benchmarks/Benchmark_dwf_fp32_paranoid.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,387 @@
 | 
			
		||||
 /*************************************************************************************
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_dwf.cc
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#define CUDA_PROFILE
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef CUDA_PROFILE
 | 
			
		||||
#include <cuda_profiler_api.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::Algebra Gmu [] = {
 | 
			
		||||
    Gamma::Algebra::GammaX,
 | 
			
		||||
    Gamma::Algebra::GammaY,
 | 
			
		||||
    Gamma::Algebra::GammaZ,
 | 
			
		||||
    Gamma::Algebra::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  Coordinate latt4 = GridDefaultLatt();
 | 
			
		||||
  int Ls=16;
 | 
			
		||||
  for(int i=0;i<argc;i++)
 | 
			
		||||
    if(std::string(argv[i]) == "-Ls"){
 | 
			
		||||
      std::stringstream ss(argv[i+1]); ss >> Ls;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  GridLogLayout();
 | 
			
		||||
 | 
			
		||||
  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
 | 
			
		||||
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
 | 
			
		||||
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
 | 
			
		||||
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionF src   (FGrid); random(RNG5,src);
 | 
			
		||||
  LatticeFermionF src1   (FGrid); random(RNG5,src1);
 | 
			
		||||
#if 0
 | 
			
		||||
  src = Zero();
 | 
			
		||||
  {
 | 
			
		||||
    Coordinate origin({0,0,0,latt4[2]-1,0});
 | 
			
		||||
    SpinColourVectorF tmp;
 | 
			
		||||
    tmp=Zero();
 | 
			
		||||
    tmp()(0)(0)=Complex(-2.0,0.0);
 | 
			
		||||
    std::cout << " source site 0 " << tmp<<std::endl;
 | 
			
		||||
    pokeSite(tmp,src,origin);
 | 
			
		||||
  }
 | 
			
		||||
#else
 | 
			
		||||
  RealD N2 = 1.0/::sqrt(norm2(src));
 | 
			
		||||
  src = src*N2;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  LatticeFermionF result(FGrid); result=Zero();
 | 
			
		||||
  LatticeFermionF    ref(FGrid);    ref=Zero();
 | 
			
		||||
  LatticeFermionF    tmp(FGrid);
 | 
			
		||||
  LatticeFermionF    err(FGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
 | 
			
		||||
  LatticeGaugeFieldF Umu(UGrid);
 | 
			
		||||
  SU<Nc>::HotConfiguration(RNG4,Umu);
 | 
			
		||||
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
 | 
			
		||||
#if 0
 | 
			
		||||
  Umu=1.0;
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    LatticeColourMatrixF ttmp(UGrid);
 | 
			
		||||
    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    //    if (mu !=2 ) ttmp = 0;
 | 
			
		||||
    //    ttmp = ttmp* pow(10.0,mu);
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Naive wilson implementation
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // replicate across fifth dimension
 | 
			
		||||
  //  LatticeGaugeFieldF Umu5d(FGrid);
 | 
			
		||||
  std::vector<LatticeColourMatrixF> U(4,UGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  {
 | 
			
		||||
    ref = Zero();
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      tmp = Cshift(src,mu+1,1);
 | 
			
		||||
      {
 | 
			
		||||
	autoView( tmp_v  , tmp  , CpuWrite);
 | 
			
		||||
	autoView( U_v  , U[mu]  , CpuRead);
 | 
			
		||||
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 | 
			
		||||
	  for(int s=0;s<Ls;s++){
 | 
			
		||||
	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	autoView( tmp_v  , tmp  , CpuWrite);
 | 
			
		||||
	autoView( U_v  , U[mu]  , CpuRead);
 | 
			
		||||
	autoView( src_v, src    , CpuRead);
 | 
			
		||||
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 | 
			
		||||
	  for(int s=0;s<Ls;s++){
 | 
			
		||||
	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      tmp =Cshift(tmp,mu+1,-1);
 | 
			
		||||
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
 | 
			
		||||
    }
 | 
			
		||||
    ref = -0.5*ref;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
  RealD NN = UGrid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
 | 
			
		||||
  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
  int ncall =100;
 | 
			
		||||
 | 
			
		||||
  if (1) {
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
    Dw.Dhop(src,result,0);
 | 
			
		||||
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      Dw.Dhop(src1,result,0);
 | 
			
		||||
      Dw.Dhop(src,result,0);
 | 
			
		||||
      err = ref-result;
 | 
			
		||||
      std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
      assert (norm2(err)< 1.0e-4 );
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
 | 
			
		||||
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
    double flops=single_site_flops*volume*ncall;
 | 
			
		||||
 | 
			
		||||
    auto nsimd = vComplex::Nsimd();
 | 
			
		||||
    auto simdwidth = sizeof(vComplex);
 | 
			
		||||
 | 
			
		||||
    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
 | 
			
		||||
    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
 | 
			
		||||
 | 
			
		||||
    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
 | 
			
		||||
    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
			
		||||
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
 | 
			
		||||
    err = ref-result;
 | 
			
		||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
    //exit(0);
 | 
			
		||||
 | 
			
		||||
    if(( norm2(err)>1.0e-4) ) {
 | 
			
		||||
 | 
			
		||||
      /*
 | 
			
		||||
      std::cout << "RESULT\n " << result<<std::endl;
 | 
			
		||||
      std::cout << "REF   \n " << ref   <<std::endl;
 | 
			
		||||
      std::cout << "ERR   \n " << err   <<std::endl;
 | 
			
		||||
      */
 | 
			
		||||
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
 | 
			
		||||
      FGrid->Barrier();
 | 
			
		||||
      exit(-1);
 | 
			
		||||
    }
 | 
			
		||||
    assert (norm2(err)< 1.0e-4 );
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  { // Naive wilson dag implementation
 | 
			
		||||
    ref = Zero();
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
 | 
			
		||||
      tmp = Cshift(src,mu+1,1);
 | 
			
		||||
      {
 | 
			
		||||
	autoView( ref_v, ref, CpuWrite);
 | 
			
		||||
	autoView( tmp_v, tmp, CpuRead);
 | 
			
		||||
	autoView( U_v  , U[mu]  , CpuRead);
 | 
			
		||||
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 | 
			
		||||
	  for(int s=0;s<Ls;s++){
 | 
			
		||||
	    int i=s+Ls*ss;
 | 
			
		||||
	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      {
 | 
			
		||||
	autoView( tmp_v  , tmp  , CpuWrite);
 | 
			
		||||
	autoView( U_v  , U[mu]  , CpuRead);
 | 
			
		||||
	autoView( src_v, src    , CpuRead);
 | 
			
		||||
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
 | 
			
		||||
	  for(int s=0;s<Ls;s++){
 | 
			
		||||
	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      tmp =adj(U[mu])*src;
 | 
			
		||||
      tmp =Cshift(tmp,mu+1,-1);
 | 
			
		||||
      {
 | 
			
		||||
	autoView( ref_v, ref, CpuWrite);
 | 
			
		||||
	autoView( tmp_v, tmp, CpuRead);
 | 
			
		||||
	for(int i=0;i<ref_v.size();i++){
 | 
			
		||||
	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    ref = -0.5*ref;
 | 
			
		||||
  }
 | 
			
		||||
  //  dump=1;
 | 
			
		||||
  Dw.Dhop(src,result,1);
 | 
			
		||||
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
  err = ref-result;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
  if((norm2(err)>1.0e-4)){
 | 
			
		||||
/*
 | 
			
		||||
	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
 | 
			
		||||
	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
 | 
			
		||||
	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
 | 
			
		||||
*/
 | 
			
		||||
  }
 | 
			
		||||
  LatticeFermionF src_e (FrbGrid);
 | 
			
		||||
  LatticeFermionF src_o (FrbGrid);
 | 
			
		||||
  LatticeFermionF r_e   (FrbGrid);
 | 
			
		||||
  LatticeFermionF r_o   (FrbGrid);
 | 
			
		||||
  LatticeFermionF r_eo  (FGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
 | 
			
		||||
  pickCheckerboard(Even,src_e,src);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // S-direction is INNERMOST and takes no part in the parity.
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
 | 
			
		||||
  if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
  if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  {
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
    Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
#ifdef CUDA_PROFILE
 | 
			
		||||
      if(i==10) cudaProfilerStart();
 | 
			
		||||
#endif
 | 
			
		||||
      Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
			
		||||
#ifdef CUDA_PROFILE
 | 
			
		||||
      if(i==20) cudaProfilerStop();
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
 | 
			
		||||
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
    double flops=(single_site_flops*volume*ncall)/2.0;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
			
		||||
  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
			
		||||
  Dw.Dhop  (src  ,result,DaggerNo);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  setCheckerboard(r_eo,r_o);
 | 
			
		||||
  setCheckerboard(r_eo,r_e);
 | 
			
		||||
 | 
			
		||||
  err = r_eo-result;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
  if((norm2(err)>1.0e-4)){
 | 
			
		||||
    /*
 | 
			
		||||
	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
 | 
			
		||||
	std::cout<< "Deo REF\n " <<result  << std::endl;
 | 
			
		||||
	std::cout<< "Deo ERR   \n " << err <<std::endl;
 | 
			
		||||
    */
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,src_e,err);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,err);
 | 
			
		||||
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  assert(norm2(src_e)<1.0e-4);
 | 
			
		||||
  assert(norm2(src_o)<1.0e-4);
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
  exit(0);
 | 
			
		||||
}
 | 
			
		||||
@@ -4,7 +4,7 @@
 | 
			
		||||
#SBATCH -p QZ1J-ICX-PVC
 | 
			
		||||
##SBATCH -p QZ1J-SPR-PVC-2C
 | 
			
		||||
 | 
			
		||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
 | 
			
		||||
export NT=8
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -4,7 +4,7 @@
 | 
			
		||||
 | 
			
		||||
#SBATCH -p QZ1J-ICX-PVC
 | 
			
		||||
 | 
			
		||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
 | 
			
		||||
export NT=16
 | 
			
		||||
 | 
			
		||||
@@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 | 
			
		||||
export I_MPI_OFFLOAD_CELL=tile
 | 
			
		||||
export EnableImplicitScaling=0
 | 
			
		||||
export EnableWalkerPartition=0
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
 | 
			
		||||
 | 
			
		||||
#mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
 | 
			
		||||
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
 | 
			
		||||
do
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 1.1.1.2.log$i
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 0  --device-mem 32768 > 2.1.1.1.log$i 
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
 | 
			
		||||
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if [ $MPI_LOCALRANKID = "0" ] 
 | 
			
		||||
#then
 | 
			
		||||
#  ~psteinbr/build_pti/ze_tracer -c $@
 | 
			
		||||
#  onetrace --chrome-kernel-timeline $@
 | 
			
		||||
#else
 | 
			
		||||
  $@
 | 
			
		||||
#fi
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user