mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/dirichlet
This commit is contained in:
		@@ -45,7 +45,7 @@ directory
 | 
			
		||||
 //disables nvcc specific warning in json.hpp
 | 
			
		||||
#pragma clang diagnostic ignored "-Wdeprecated-register"
 | 
			
		||||
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
 //disables nvcc specific warning in json.hpp
 | 
			
		||||
#pragma nv_diag_suppress unsigned_compare_with_zero
 | 
			
		||||
#pragma nv_diag_suppress cast_to_qualified_type
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@
 | 
			
		||||
/* NVCC save and restore compile environment*/
 | 
			
		||||
#ifdef __NVCC__
 | 
			
		||||
#pragma push
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
#pragma nv_diag_suppress code_is_unreachable
 | 
			
		||||
#else
 | 
			
		||||
#pragma diag_suppress code_is_unreachable
 | 
			
		||||
 
 | 
			
		||||
@@ -97,8 +97,8 @@ private:
 | 
			
		||||
  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
 | 
			
		||||
  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
 | 
			
		||||
 | 
			
		||||
  static void PrintBytes(void);
 | 
			
		||||
 public:
 | 
			
		||||
  static void PrintBytes(void);
 | 
			
		||||
  static void Audit(std::string s);
 | 
			
		||||
  static void Init(void);
 | 
			
		||||
  static void InitMessage(void);
 | 
			
		||||
@@ -119,6 +119,8 @@ private:
 | 
			
		||||
  static uint64_t     DeviceToHostBytes;
 | 
			
		||||
  static uint64_t     HostToDeviceXfer;
 | 
			
		||||
  static uint64_t     DeviceToHostXfer;
 | 
			
		||||
  static uint64_t     DeviceEvictions;
 | 
			
		||||
  static uint64_t     DeviceDestroy;
 | 
			
		||||
 
 | 
			
		||||
 private:
 | 
			
		||||
#ifndef GRID_UVM
 | 
			
		||||
@@ -176,6 +178,7 @@ private:
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
  static void Print(void);
 | 
			
		||||
  static void PrintAll(void);
 | 
			
		||||
  static void PrintState( void* CpuPtr);
 | 
			
		||||
  static int   isOpen   (void* CpuPtr);
 | 
			
		||||
  static void  ViewClose(void* CpuPtr,ViewMode mode);
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceEvictions;
 | 
			
		||||
uint64_t  MemoryManager::DeviceDestroy;
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////
 | 
			
		||||
// Priority ordering for unlocked entries
 | 
			
		||||
@@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 | 
			
		||||
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
 | 
			
		||||
  if(AccCache.AccPtr) {
 | 
			
		||||
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
 | 
			
		||||
    DeviceDestroy++;
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    LRUremove(AccCache);
 | 
			
		||||
    AccCache.AccPtr=(uint64_t) NULL;
 | 
			
		||||
    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);  
 | 
			
		||||
  }
 | 
			
		||||
  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
@@ -126,8 +130,14 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 | 
			
		||||
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 | 
			
		||||
{
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Make CPU consistent, remove from Accelerator, remove entry
 | 
			
		||||
  // Cannot be locked. If allocated must be in LRU pool.
 | 
			
		||||
  // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
 | 
			
		||||
  // Cannot be acclocked. If allocated must be in LRU pool.
 | 
			
		||||
  //
 | 
			
		||||
  // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
 | 
			
		||||
  //                          and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
 | 
			
		||||
  //                          but there is a weakness where CpuLock entries are attempted for erase
 | 
			
		||||
  //                          Take these OUT LRU queue when CPU locked?
 | 
			
		||||
  //                          Cannot take out the table as cpuLock data is important.
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  assert(AccCache.state!=Empty);
 | 
			
		||||
  
 | 
			
		||||
@@ -139,15 +149,17 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 | 
			
		||||
  if(AccCache.state==AccDirty) {
 | 
			
		||||
    Flush(AccCache);
 | 
			
		||||
  }
 | 
			
		||||
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
 | 
			
		||||
  if(AccCache.AccPtr) {
 | 
			
		||||
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    LRUremove(AccCache);
 | 
			
		||||
    AccCache.AccPtr=(uint64_t)NULL;
 | 
			
		||||
    AccCache.state=CpuDirty; // CPU primary now
 | 
			
		||||
    DeviceBytes   -=AccCache.bytes;
 | 
			
		||||
    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
 | 
			
		||||
  }
 | 
			
		||||
  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
  EntryErase(CpuPtr);
 | 
			
		||||
  //  uint64_t CpuPtr = AccCache.CpuPtr;
 | 
			
		||||
  DeviceEvictions++;
 | 
			
		||||
  //  EntryErase(CpuPtr);
 | 
			
		||||
}
 | 
			
		||||
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 | 
			
		||||
{
 | 
			
		||||
@@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::EvictVictims(uint64_t bytes)
 | 
			
		||||
{
 | 
			
		||||
  assert(bytes<DeviceMaxBytes);
 | 
			
		||||
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
 | 
			
		||||
    if ( DeviceLRUBytes > 0){
 | 
			
		||||
      assert(LRU.size()>0);
 | 
			
		||||
      uint64_t victim = LRU.back();
 | 
			
		||||
      uint64_t victim = LRU.back(); // From the LRU
 | 
			
		||||
      auto AccCacheIterator = EntryLookup(victim);
 | 
			
		||||
      auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
      Evict(AccCache);
 | 
			
		||||
    } else {
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // If view is opened on device remove from LRU
 | 
			
		||||
  assert(AccCache.accLock>0);
 | 
			
		||||
  // If view is opened on device must remove from LRU
 | 
			
		||||
  if(AccCache.LRU_valid==1){
 | 
			
		||||
    // must possibly remove from LRU as now locked on GPU
 | 
			
		||||
    dprintf("AccCache entry removed from LRU \n");
 | 
			
		||||
@@ -388,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
 | 
			
		||||
  auto AccCacheIterator = EntryLookup(CpuPtr);
 | 
			
		||||
  auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
 | 
			
		||||
  if (!AccCache.AccPtr) {
 | 
			
		||||
     EvictVictims(bytes);
 | 
			
		||||
  }
 | 
			
		||||
  // CPU doesn't need to free space
 | 
			
		||||
  //  if (!AccCache.AccPtr) {
 | 
			
		||||
  //    EvictVictims(bytes);
 | 
			
		||||
  //  }
 | 
			
		||||
 | 
			
		||||
  assert((mode==CpuRead)||(mode==CpuWrite));
 | 
			
		||||
  assert(AccCache.accLock==0);  // Programming error
 | 
			
		||||
@@ -444,20 +461,28 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 | 
			
		||||
void  MemoryManager::Print(void)
 | 
			
		||||
{
 | 
			
		||||
  PrintBytes();
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceBytes   << " bytes allocated on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device       " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceXfer << " transfers        to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostXfer << " transfers        from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Memory Manager                             " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceBytes   << " bytes allocated on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device       " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << HostToDeviceXfer << " transfers        to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceToHostXfer << " transfers        from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to   device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceEvictions  << " Evictions from device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << DeviceDestroy    << " Destroyed vectors on device " << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
}
 | 
			
		||||
void  MemoryManager::PrintAll(void)
 | 
			
		||||
{
 | 
			
		||||
  Print();
 | 
			
		||||
  std::cout << GridLogMessage << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 | 
			
		||||
    auto &AccCache = it->second;
 | 
			
		||||
    
 | 
			
		||||
@@ -467,13 +492,13 @@ void  MemoryManager::Print(void)
 | 
			
		||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
			
		||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
	      << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
	      << "\t" << AccCache.cpuLock
 | 
			
		||||
	      << "\t" << AccCache.accLock
 | 
			
		||||
	      << "\t" << AccCache.LRU_valid<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
int   MemoryManager::isOpen   (void* _CpuPtr) 
 | 
			
		||||
@@ -489,6 +514,25 @@ int   MemoryManager::isOpen   (void* _CpuPtr)
 | 
			
		||||
}
 | 
			
		||||
void MemoryManager::Audit(std::string s)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t CpuBytes=0;
 | 
			
		||||
  uint64_t AccBytes=0;
 | 
			
		||||
  uint64_t LruBytes1=0;
 | 
			
		||||
  uint64_t LruBytes2=0;
 | 
			
		||||
  uint64_t LruCnt=0;
 | 
			
		||||
  uint64_t LockedBytes=0;
 | 
			
		||||
  
 | 
			
		||||
  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
 | 
			
		||||
  for(auto it=LRU.begin();it!=LRU.end();it++){
 | 
			
		||||
    uint64_t cpuPtr = *it;
 | 
			
		||||
    assert(EntryPresent(cpuPtr));
 | 
			
		||||
    auto AccCacheIterator = EntryLookup(cpuPtr);
 | 
			
		||||
    auto & AccCache = AccCacheIterator->second;
 | 
			
		||||
    LruBytes2+=AccCache.bytes;
 | 
			
		||||
    assert(AccCache.LRU_valid==1);
 | 
			
		||||
    assert(AccCache.LRU_entry==it);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 | 
			
		||||
    auto &AccCache = it->second;
 | 
			
		||||
    
 | 
			
		||||
@@ -498,7 +542,14 @@ void MemoryManager::Audit(std::string s)
 | 
			
		||||
    if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 | 
			
		||||
    if ( AccCache.state==Consistent)str = std::string("Consistent");
 | 
			
		||||
 | 
			
		||||
    CpuBytes+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.AccPtr )    AccBytes+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
 | 
			
		||||
    if( AccCache.LRU_valid ) LruCnt++;
 | 
			
		||||
    
 | 
			
		||||
    if ( AccCache.cpuLock || AccCache.accLock ) {
 | 
			
		||||
      assert(AccCache.LRU_valid==0);
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
		<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
		<< "\t cpuLock  " << AccCache.cpuLock
 | 
			
		||||
@@ -509,6 +560,15 @@ void MemoryManager::Audit(std::string s)
 | 
			
		||||
    assert( AccCache.cpuLock== 0 ) ;
 | 
			
		||||
    assert( AccCache.accLock== 0 ) ;
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
 | 
			
		||||
  assert(LruBytes1==LruBytes2);
 | 
			
		||||
  assert(LruBytes1==DeviceLRUBytes);
 | 
			
		||||
  std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
 | 
			
		||||
  assert(AccBytes==DeviceBytes);
 | 
			
		||||
  std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
 | 
			
		||||
  assert(LruCnt == LRU.size());
 | 
			
		||||
  std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void MemoryManager::PrintState(void* _CpuPtr)
 | 
			
		||||
@@ -526,8 +586,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
 | 
			
		||||
    if ( AccCache.state==EvictNext) str = std::string("EvictNext");
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
    std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
 | 
			
		||||
    << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 | 
			
		||||
    << "\t" << AccCache.cpuLock
 | 
			
		||||
    << "\t" << AccCache.accLock
 | 
			
		||||
    << "\t" << AccCache.LRU_valid<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,8 @@ uint64_t  MemoryManager::HostToDeviceBytes;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostBytes;
 | 
			
		||||
uint64_t  MemoryManager::HostToDeviceXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceToHostXfer;
 | 
			
		||||
uint64_t  MemoryManager::DeviceEvictions;
 | 
			
		||||
uint64_t  MemoryManager::DeviceDestroy;
 | 
			
		||||
 | 
			
		||||
void  MemoryManager::Audit(std::string s){};
 | 
			
		||||
void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 | 
			
		||||
@@ -22,6 +24,7 @@ void  MemoryManager::PrintState(void* CpuPtr)
 | 
			
		||||
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 | 
			
		||||
};
 | 
			
		||||
void  MemoryManager::Print(void){};
 | 
			
		||||
void  MemoryManager::PrintAll(void){};
 | 
			
		||||
void  MemoryManager::NotifyDeletion(void *ptr){};
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
@@ -248,7 +248,7 @@ public:
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  // user defined constructor
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { 
 | 
			
		||||
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) { 
 | 
			
		||||
    this->_grid = grid;
 | 
			
		||||
    resize(this->_grid->oSites());
 | 
			
		||||
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
 | 
			
		||||
@@ -291,8 +291,8 @@ public:
 | 
			
		||||
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
@@ -306,8 +306,8 @@ public:
 | 
			
		||||
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
 | 
			
		||||
    this->checkerboard = r.Checkerboard();
 | 
			
		||||
    conformable(*this,r);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    auto him= r.View(AcceleratorRead);
 | 
			
		||||
    auto me =   View(AcceleratorWriteDiscard);
 | 
			
		||||
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
 | 
			
		||||
      coalescedWrite(me[ss],him(ss));
 | 
			
		||||
    });
 | 
			
		||||
 
 | 
			
		||||
@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(GRID_SYCL)
 | 
			
		||||
#include <Grid/lattice/Lattice_reduction_sycl.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
@@ -124,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  return sum_gpu(arg,osites);
 | 
			
		||||
#else
 | 
			
		||||
  return sum_cpu(arg,osites);
 | 
			
		||||
@@ -133,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  return sumD_gpu(arg,osites);
 | 
			
		||||
#else
 | 
			
		||||
  return sumD_cpu(arg,osites);
 | 
			
		||||
@@ -142,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  return sumD_gpu_large(arg,osites);
 | 
			
		||||
#else
 | 
			
		||||
  return sumD_cpu(arg,osites);
 | 
			
		||||
@@ -152,13 +155,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_gpu(&arg_v[0],osites);
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  typename vobj::scalar_object ssum;
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  ssum= sum_gpu(&arg_v[0],osites);
 | 
			
		||||
#else
 | 
			
		||||
  autoView(arg_v, arg, CpuRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_cpu(&arg_v[0],osites);
 | 
			
		||||
#endif  
 | 
			
		||||
  arg.Grid()->GlobalSum(ssum);
 | 
			
		||||
@@ -168,7 +171,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 | 
			
		||||
{
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
 | 
			
		||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 | 
			
		||||
  autoView( arg_v, arg, AcceleratorRead);
 | 
			
		||||
  Integer osites = arg.Grid()->oSites();
 | 
			
		||||
  auto ssum= sum_gpu_large(&arg_v[0],osites);
 | 
			
		||||
@@ -232,11 +235,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 | 
			
		||||
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 | 
			
		||||
  Vector<inner_t> inner_tmp(sites);
 | 
			
		||||
  auto inner_tmp_v = &inner_tmp[0];
 | 
			
		||||
    
 | 
			
		||||
  {
 | 
			
		||||
    autoView( left_v , left, AcceleratorRead);
 | 
			
		||||
    autoView( right_v,right, AcceleratorRead);
 | 
			
		||||
 | 
			
		||||
    // This code could read coalesce
 | 
			
		||||
    // GPU - SIMT lane compliance...
 | 
			
		||||
    accelerator_for( ss, sites, nsimd,{
 | 
			
		||||
	auto x_l = left_v(ss);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										125
									
								
								Grid/lattice/Lattice_reduction_sycl.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								Grid/lattice/Lattice_reduction_sycl.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,125 @@
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Possibly promote to double and sum
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_objectD sobjD;
 | 
			
		||||
  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
 | 
			
		||||
  sobj identity; zeroit(identity);
 | 
			
		||||
  sobj ret ; 
 | 
			
		||||
 | 
			
		||||
  Integer nsimd= vobj::Nsimd();
 | 
			
		||||
  
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
 | 
			
		||||
     cgh.parallel_for(cl::sycl::range<1>{osites},
 | 
			
		||||
		      Reduction,
 | 
			
		||||
		      [=] (cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
      auto osite   = item[0];
 | 
			
		||||
      sum +=Reduce(lat[osite]);
 | 
			
		||||
     });
 | 
			
		||||
   });
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  ret = mysum[0];
 | 
			
		||||
  free(mysum,*theGridAccelerator);
 | 
			
		||||
  sobjD dret; convertType(dret,ret);
 | 
			
		||||
  return dret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  return sumD_gpu_tensor(lat,osites);
 | 
			
		||||
}
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  return sumD_gpu_large(lat,osites);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  return sumD_gpu_large(lat,osites);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Return as same precision as input performing reduction in double precision though
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj result;
 | 
			
		||||
  result = sumD_gpu(lat,osites);
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  sobj result;
 | 
			
		||||
  result = sumD_gpu_large(lat,osites);
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
 | 
			
		||||
{
 | 
			
		||||
  Double sumResult; zeroit(sumResult);
 | 
			
		||||
  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
 | 
			
		||||
  Double identity;  zeroit(identity);
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
 | 
			
		||||
     cgh.parallel_for(cl::sycl::range<1>{L},
 | 
			
		||||
		      Reduction,
 | 
			
		||||
		      [=] (cl::sycl::id<1> index, auto &sum) {
 | 
			
		||||
	 sum +=vec[index];
 | 
			
		||||
     });
 | 
			
		||||
   });
 | 
			
		||||
  theGridAccelerator->wait();
 | 
			
		||||
  Double ret = d_sum[0];
 | 
			
		||||
  free(d_sum,*theGridAccelerator);
 | 
			
		||||
  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class vobj>
 | 
			
		||||
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::vector_type  vector;
 | 
			
		||||
  typedef typename vobj::scalar_type  scalar;
 | 
			
		||||
 | 
			
		||||
  typedef typename vobj::scalar_typeD scalarD;
 | 
			
		||||
  typedef typename vobj::scalar_objectD sobjD;
 | 
			
		||||
 | 
			
		||||
  sobjD ret;
 | 
			
		||||
  scalarD *ret_p = (scalarD *)&ret;
 | 
			
		||||
  
 | 
			
		||||
  const int nsimd = vobj::Nsimd();
 | 
			
		||||
  const int words = sizeof(vobj)/sizeof(vector);
 | 
			
		||||
 | 
			
		||||
  Vector<scalar> buffer(osites*nsimd);
 | 
			
		||||
  scalar *buf = &buffer[0];
 | 
			
		||||
  vector *dat = (vector *)lat;
 | 
			
		||||
 | 
			
		||||
  for(int w=0;w<words;w++) {
 | 
			
		||||
 | 
			
		||||
    accelerator_for(ss,osites,nsimd,{
 | 
			
		||||
	int lane = acceleratorSIMTlane(nsimd);
 | 
			
		||||
	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
 | 
			
		||||
    });
 | 
			
		||||
    //Precision change at this point is to late to gain precision
 | 
			
		||||
    ret_p[w] = svm_reduce(buf,nsimd*osites);
 | 
			
		||||
  }
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
*/
 | 
			
		||||
@@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifndef GRID_PERFCOUNT_H
 | 
			
		||||
#define GRID_PERFCOUNT_H
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#ifndef __SSC_START
 | 
			
		||||
#define __SSC_START
 | 
			
		||||
#define __SSC_STOP
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include <sys/time.h>
 | 
			
		||||
#include <ctime>
 | 
			
		||||
#include <chrono>
 | 
			
		||||
 
 | 
			
		||||
@@ -16,7 +16,7 @@
 | 
			
		||||
 | 
			
		||||
#ifdef __NVCC__
 | 
			
		||||
#pragma push
 | 
			
		||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
 | 
			
		||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 | 
			
		||||
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 | 
			
		||||
#else
 | 
			
		||||
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
 | 
			
		||||
 
 | 
			
		||||
@@ -507,9 +507,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 | 
			
		||||
// Fermion <-> propagator assignements
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
//template <class Prop, class Ferm>
 | 
			
		||||
#define FAST_FERM_TO_PROP
 | 
			
		||||
template <class Fimpl>
 | 
			
		||||
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 | 
			
		||||
{
 | 
			
		||||
#ifdef FAST_FERM_TO_PROP
 | 
			
		||||
  autoView(p_v,p,CpuWrite);
 | 
			
		||||
  autoView(f_v,f,CpuRead);
 | 
			
		||||
  thread_for(idx,p_v.oSites(),{
 | 
			
		||||
      for(int ss = 0; ss < Ns; ++ss) {
 | 
			
		||||
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 | 
			
		||||
	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
 | 
			
		||||
      }}
 | 
			
		||||
    });
 | 
			
		||||
#else
 | 
			
		||||
  for(int j = 0; j < Ns; ++j)
 | 
			
		||||
    {
 | 
			
		||||
      auto pjs = peekSpin(p, j, s);
 | 
			
		||||
@@ -521,12 +532,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 | 
			
		||||
	}
 | 
			
		||||
      pokeSpin(p, pjs, j, s);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
    
 | 
			
		||||
//template <class Prop, class Ferm>
 | 
			
		||||
template <class Fimpl>
 | 
			
		||||
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 | 
			
		||||
{
 | 
			
		||||
#ifdef FAST_FERM_TO_PROP
 | 
			
		||||
  autoView(p_v,p,CpuRead);
 | 
			
		||||
  autoView(f_v,f,CpuWrite);
 | 
			
		||||
  thread_for(idx,p_v.oSites(),{
 | 
			
		||||
      for(int ss = 0; ss < Ns; ++ss) {
 | 
			
		||||
      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 | 
			
		||||
	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
 | 
			
		||||
      }}
 | 
			
		||||
    });
 | 
			
		||||
#else
 | 
			
		||||
  for(int j = 0; j < Ns; ++j)
 | 
			
		||||
    {
 | 
			
		||||
      auto pjs = peekSpin(p, j, s);
 | 
			
		||||
@@ -538,6 +560,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 | 
			
		||||
	}
 | 
			
		||||
      pokeSpin(f, fj, j);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
    
 | 
			
		||||
//////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -205,15 +205,18 @@ public:
 | 
			
		||||
  typedef WilsonCloverHelpers<Impl> Helpers;
 | 
			
		||||
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
 | 
			
		||||
 | 
			
		||||
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
 | 
			
		||||
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
 | 
			
		||||
    Clover += diag_mass;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
 | 
			
		||||
                          CloverTriangleField& Triangle,
 | 
			
		||||
                          RealD csw_t, RealD diag_mass) {
 | 
			
		||||
  static void InvertClover(CloverField& InvClover,
 | 
			
		||||
                            const CloverDiagonalField& diagonal,
 | 
			
		||||
                            const CloverTriangleField& triangle,
 | 
			
		||||
                            CloverDiagonalField&       diagonalInv,
 | 
			
		||||
                            CloverTriangleField&       triangleInv,
 | 
			
		||||
                            bool fixedBoundaries) {
 | 
			
		||||
 | 
			
		||||
    // Do nothing
 | 
			
		||||
    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // TODO: implement Cmunu for better performances with compact layout, but don't do it
 | 
			
		||||
@@ -238,9 +241,17 @@ public:
 | 
			
		||||
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
 | 
			
		||||
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
 | 
			
		||||
 | 
			
		||||
  static void MassTerm(CloverField& Clover, RealD diag_mass) {
 | 
			
		||||
    // do nothing!
 | 
			
		||||
    // mass term is multiplied to exp(Clover) below
 | 
			
		||||
  // Can this be avoided?
 | 
			
		||||
  static void IdentityTimesC(const CloverField& in, RealD c) {
 | 
			
		||||
    int DimRep = Impl::Dimension;
 | 
			
		||||
 | 
			
		||||
    autoView(in_v, in, AcceleratorWrite);
 | 
			
		||||
 | 
			
		||||
    accelerator_for(ss, in.Grid()->oSites(), 1, {
 | 
			
		||||
      for (int sa=0; sa<Ns; sa++)
 | 
			
		||||
        for (int ca=0; ca<DimRep; ca++)
 | 
			
		||||
          in_v[ss]()(sa,sa)(ca,ca) = c;
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static int getNMAX(RealD prec, RealD R) {
 | 
			
		||||
@@ -255,175 +266,62 @@ public:
 | 
			
		||||
    return NMAX;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
 | 
			
		||||
  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
 | 
			
		||||
  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
 | 
			
		||||
  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
 | 
			
		||||
 | 
			
		||||
  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
 | 
			
		||||
  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
 | 
			
		||||
 | 
			
		||||
  	  typedef iMatrix<ComplexD,6> mat;
 | 
			
		||||
    GridBase* grid = Clover.Grid();
 | 
			
		||||
    CloverField ExpClover(grid);
 | 
			
		||||
 | 
			
		||||
  	  RealD qn[6];
 | 
			
		||||
  	  RealD qnold[6];
 | 
			
		||||
  	  RealD p[5];
 | 
			
		||||
  	  RealD trA2, trA3, trA4;
 | 
			
		||||
    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
 | 
			
		||||
 | 
			
		||||
  	  mat A2, A3, A4, A5;
 | 
			
		||||
  	  A2 = alpha * alpha * arg * arg;
 | 
			
		||||
  	  A3 = alpha * arg * A2;
 | 
			
		||||
  	  A4 = A2 * A2;
 | 
			
		||||
  	  A5 = A2 * A3;
 | 
			
		||||
    Clover *= (1.0/diag_mass);
 | 
			
		||||
 | 
			
		||||
  	  trA2 = toReal( trace(A2) );
 | 
			
		||||
  	  trA3 = toReal( trace(A3) );
 | 
			
		||||
  	  trA4 = toReal( trace(A4));
 | 
			
		||||
 | 
			
		||||
  	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
 | 
			
		||||
  	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
 | 
			
		||||
  	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
 | 
			
		||||
  	  p[3] = trA3 / 3.0;
 | 
			
		||||
  	  p[4] = 0.5 * trA2;
 | 
			
		||||
 | 
			
		||||
  	  qnold[0] = cN[Niter];
 | 
			
		||||
  	  qnold[1] = 0.0;
 | 
			
		||||
  	  qnold[2] = 0.0;
 | 
			
		||||
  	  qnold[3] = 0.0;
 | 
			
		||||
  	  qnold[4] = 0.0;
 | 
			
		||||
  	  qnold[5] = 0.0;
 | 
			
		||||
 | 
			
		||||
  	  for(int i = Niter-1; i >= 0; i--)
 | 
			
		||||
  	  {
 | 
			
		||||
  	   qn[0] = p[0] * qnold[5] + cN[i];
 | 
			
		||||
  	   qn[1] = p[1] * qnold[5] + qnold[0];
 | 
			
		||||
  	   qn[2] = p[2] * qnold[5] + qnold[1];
 | 
			
		||||
  	   qn[3] = p[3] * qnold[5] + qnold[2];
 | 
			
		||||
  	   qn[4] = p[4] * qnold[5] + qnold[3];
 | 
			
		||||
  	   qn[5] = qnold[4];
 | 
			
		||||
 | 
			
		||||
  	   qnold[0] = qn[0];
 | 
			
		||||
  	   qnold[1] = qn[1];
 | 
			
		||||
  	   qnold[2] = qn[2];
 | 
			
		||||
  	   qnold[3] = qn[3];
 | 
			
		||||
  	   qnold[4] = qn[4];
 | 
			
		||||
  	   qnold[5] = qn[5];
 | 
			
		||||
  	  }
 | 
			
		||||
 | 
			
		||||
  	  mat unit(1.0);
 | 
			
		||||
 | 
			
		||||
  	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
 | 
			
		||||
 | 
			
		||||
    GridBase* grid = Diagonal.Grid();
 | 
			
		||||
    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
 | 
			
		||||
 | 
			
		||||
    //
 | 
			
		||||
    // Implementation completely in Daniel's layout
 | 
			
		||||
    //
 | 
			
		||||
 | 
			
		||||
    // Taylor expansion with Cayley-Hamilton recursion
 | 
			
		||||
    // underlying Horner scheme as above
 | 
			
		||||
    // Taylor expansion, slow but generic
 | 
			
		||||
    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
 | 
			
		||||
    // qN = cN
 | 
			
		||||
    // qn = cn + qn+1 X
 | 
			
		||||
    std::vector<RealD> cn(NMAX+1);
 | 
			
		||||
    cn[0] = 1.0;
 | 
			
		||||
    for (int i=1; i<=NMAX; i++){
 | 
			
		||||
    for (int i=1; i<=NMAX; i++)
 | 
			
		||||
      cn[i] = cn[i-1] / RealD(i);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
      // Taken over from Daniel's implementation
 | 
			
		||||
      conformable(Diagonal, Triangle);
 | 
			
		||||
    ExpClover = Zero();
 | 
			
		||||
    IdentityTimesC(ExpClover, cn[NMAX]);
 | 
			
		||||
    for (int i=NMAX-1; i>=0; i--)
 | 
			
		||||
      ExpClover = ExpClover * Clover + cn[i];
 | 
			
		||||
 | 
			
		||||
      long lsites = grid->lSites();
 | 
			
		||||
    {
 | 
			
		||||
      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
 | 
			
		||||
      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
 | 
			
		||||
      typedef iMatrix<ComplexD,6> mat;
 | 
			
		||||
    // prepare inverse
 | 
			
		||||
    CloverInv = (-1.0)*Clover;
 | 
			
		||||
 | 
			
		||||
      autoView(diagonal_v,  Diagonal,  CpuRead);
 | 
			
		||||
      autoView(triangle_v,  Triangle,  CpuRead);
 | 
			
		||||
      autoView(diagonalExp_v, Diagonal, CpuWrite);
 | 
			
		||||
      autoView(triangleExp_v, Triangle, CpuWrite);
 | 
			
		||||
    Clover = ExpClover * diag_mass;
 | 
			
		||||
 | 
			
		||||
      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
 | 
			
		||||
    ExpClover = Zero();
 | 
			
		||||
    IdentityTimesC(ExpClover, cn[NMAX]);
 | 
			
		||||
    for (int i=NMAX-1; i>=0; i--)
 | 
			
		||||
      ExpClover = ExpClover * CloverInv + cn[i];
 | 
			
		||||
 | 
			
		||||
    	  mat srcCloverOpUL(0.0); // upper left block
 | 
			
		||||
    	  mat srcCloverOpLR(0.0); // lower right block
 | 
			
		||||
    	  mat ExpCloverOp;
 | 
			
		||||
    CloverInv = ExpClover * (1.0/diag_mass);
 | 
			
		||||
 | 
			
		||||
        scalar_object_diagonal diagonal_tmp     = Zero();
 | 
			
		||||
        scalar_object_diagonal diagonal_exp_tmp = Zero();
 | 
			
		||||
        scalar_object_triangle triangle_tmp     = Zero();
 | 
			
		||||
        scalar_object_triangle triangle_exp_tmp = Zero();
 | 
			
		||||
 | 
			
		||||
        Coordinate lcoor;
 | 
			
		||||
        grid->LocalIndexToLocalCoor(site, lcoor);
 | 
			
		||||
 | 
			
		||||
        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
 | 
			
		||||
        peekLocalSite(triangle_tmp, triangle_v, lcoor);
 | 
			
		||||
 | 
			
		||||
        int block;
 | 
			
		||||
        block = 0;
 | 
			
		||||
        for(int i = 0; i < 6; i++){
 | 
			
		||||
        	for(int j = 0; j < 6; j++){
 | 
			
		||||
        		if (i == j){
 | 
			
		||||
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
 | 
			
		||||
        		}
 | 
			
		||||
        		else{
 | 
			
		||||
        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
 | 
			
		||||
        		}
 | 
			
		||||
        	}
 | 
			
		||||
        }
 | 
			
		||||
        block = 1;
 | 
			
		||||
        for(int i = 0; i < 6; i++){
 | 
			
		||||
          	for(int j = 0; j < 6; j++){
 | 
			
		||||
           		if (i == j){
 | 
			
		||||
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
 | 
			
		||||
           		}
 | 
			
		||||
           		else{
 | 
			
		||||
           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
 | 
			
		||||
           		}
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // exp(Clover)
 | 
			
		||||
 | 
			
		||||
        ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
 | 
			
		||||
 | 
			
		||||
        block = 0;
 | 
			
		||||
        for(int i = 0; i < 6; i++){
 | 
			
		||||
        	for(int j = 0; j < 6; j++){
 | 
			
		||||
            	if (i == j){
 | 
			
		||||
            		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
 | 
			
		||||
            	}
 | 
			
		||||
            	else if(i < j){
 | 
			
		||||
            		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
 | 
			
		||||
            	}
 | 
			
		||||
           	}
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
 | 
			
		||||
 | 
			
		||||
        block = 1;
 | 
			
		||||
        for(int i = 0; i < 6; i++){
 | 
			
		||||
        	for(int j = 0; j < 6; j++){
 | 
			
		||||
              	if (i == j){
 | 
			
		||||
              		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
 | 
			
		||||
               	}
 | 
			
		||||
               	else if(i < j){
 | 
			
		||||
               		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
 | 
			
		||||
               	}
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
 | 
			
		||||
        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
 | 
			
		||||
      });
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Diagonal *= diag_mass;
 | 
			
		||||
    Triangle *= diag_mass;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void InvertClover(CloverField& InvClover,
 | 
			
		||||
                            const CloverDiagonalField& diagonal,
 | 
			
		||||
                            const CloverTriangleField& triangle,
 | 
			
		||||
                            CloverDiagonalField&       diagonalInv,
 | 
			
		||||
                            CloverTriangleField&       triangleInv,
 | 
			
		||||
                            bool fixedBoundaries) {
 | 
			
		||||
 | 
			
		||||
    if (fixedBoundaries)
 | 
			
		||||
    {
 | 
			
		||||
      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
 | 
			
		||||
    assert(0);
 | 
			
		||||
 
 | 
			
		||||
@@ -225,7 +225,7 @@ public:
 | 
			
		||||
  RealD csw_t;
 | 
			
		||||
  RealD cF;
 | 
			
		||||
 | 
			
		||||
  bool open_boundaries;
 | 
			
		||||
  bool fixedBoundaries;
 | 
			
		||||
 | 
			
		||||
  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
 | 
			
		||||
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
 | 
			
		||||
 
 | 
			
		||||
@@ -320,7 +320,7 @@ public:
 | 
			
		||||
    typedef decltype(coalescedRead(in0))    sobj;
 | 
			
		||||
    typedef decltype(coalescedRead(out0)) hsobj;
 | 
			
		||||
 | 
			
		||||
    unsigned int Nsimd = vobj::Nsimd();
 | 
			
		||||
    constexpr unsigned int Nsimd = vobj::Nsimd();
 | 
			
		||||
    unsigned int mask = Nsimd >> (type + 1);
 | 
			
		||||
    int lane = acceleratorSIMTlane(Nsimd);
 | 
			
		||||
    int j0 = lane &(~mask); // inner coor zero
 | 
			
		||||
 
 | 
			
		||||
@@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 | 
			
		||||
  , csw_r(_csw_r)
 | 
			
		||||
  , csw_t(_csw_t)
 | 
			
		||||
  , cF(_cF)
 | 
			
		||||
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
 | 
			
		||||
  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
 | 
			
		||||
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
 | 
			
		||||
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
 | 
			
		||||
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
 | 
			
		||||
@@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 | 
			
		||||
    csw_r /= clover_anisotropy.xi_0;
 | 
			
		||||
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
  if (open_boundaries) {
 | 
			
		||||
  if (fixedBoundaries) {
 | 
			
		||||
    this->BoundaryMaskEven.Checkerboard() = Even;
 | 
			
		||||
    this->BoundaryMaskOdd.Checkerboard() = Odd;
 | 
			
		||||
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
 | 
			
		||||
@@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
 | 
			
		||||
  WilsonBase::Dhop(in, out, dag);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
 | 
			
		||||
  WilsonBase::DhopOE(in, out, dag);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
 | 
			
		||||
  WilsonBase::DhopEO(in, out, dag);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
 | 
			
		||||
  WilsonBase::DhopDir(in, out, dir, disp);
 | 
			
		||||
  if(this->open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
 | 
			
		||||
  WilsonBase::DhopDirAll(in, out);
 | 
			
		||||
  if(this->open_boundaries) {
 | 
			
		||||
  if(this->fixedBoundaries) {
 | 
			
		||||
    for(auto& o : out) ApplyBoundaryMask(o);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
 | 
			
		||||
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
 | 
			
		||||
  Mooee(in, Tmp);
 | 
			
		||||
  axpy(out, 1.0, out, Tmp);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
@@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
 | 
			
		||||
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
 | 
			
		||||
  MooeeDag(in, Tmp);
 | 
			
		||||
  axpy(out, 1.0, out, Tmp);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
 | 
			
		||||
  WilsonBase::Meooe(in, out);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
 | 
			
		||||
  WilsonBase::MeooeDag(in, out);
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
@@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
 | 
			
		||||
  } else {
 | 
			
		||||
    MooeeInternal(in, out, Diagonal, Triangle);
 | 
			
		||||
  }
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
@@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
 | 
			
		||||
  } else {
 | 
			
		||||
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
 | 
			
		||||
  }
 | 
			
		||||
  if(open_boundaries) ApplyBoundaryMask(out);
 | 
			
		||||
  if(fixedBoundaries) ApplyBoundaryMask(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
@@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
 | 
			
		||||
 | 
			
		||||
template<class Impl, class CloverHelpers>
 | 
			
		||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
 | 
			
		||||
  assert(!open_boundaries); // TODO check for changes required for open bc
 | 
			
		||||
  assert(!fixedBoundaries); // TODO check for changes required for open bc
 | 
			
		||||
 | 
			
		||||
  // NOTE: code copied from original clover term
 | 
			
		||||
  conformable(X.Grid(), Y.Grid());
 | 
			
		||||
@@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 | 
			
		||||
  GridBase* grid = _Umu.Grid();
 | 
			
		||||
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
 | 
			
		||||
  CloverField TmpOriginal(grid);
 | 
			
		||||
  CloverField TmpInverse(grid);
 | 
			
		||||
 | 
			
		||||
  // Compute the field strength terms mu>nu
 | 
			
		||||
  double t2 = usecond();
 | 
			
		||||
@@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 | 
			
		||||
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
 | 
			
		||||
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
 | 
			
		||||
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
 | 
			
		||||
  // Handle mass term based on clover policy
 | 
			
		||||
  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
 | 
			
		||||
 | 
			
		||||
  // Instantiate the clover term
 | 
			
		||||
  // - In case of the standard clover the mass term is added
 | 
			
		||||
  // - In case of the exponential clover the clover term is exponentiated
 | 
			
		||||
  double t4 = usecond();
 | 
			
		||||
  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
 | 
			
		||||
 | 
			
		||||
  // Convert the data layout of the clover term
 | 
			
		||||
  double t4 = usecond();
 | 
			
		||||
  double t5 = usecond();
 | 
			
		||||
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
 | 
			
		||||
 | 
			
		||||
  // Exponentiate the clover (nothing happens in case of the standard clover)
 | 
			
		||||
  double t5 = usecond();
 | 
			
		||||
  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
 | 
			
		||||
 | 
			
		||||
  // Possible modify the boundary values
 | 
			
		||||
  // Modify the clover term at the temporal boundaries in case of open boundary conditions
 | 
			
		||||
  double t6 = usecond();
 | 
			
		||||
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
 | 
			
		||||
  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
 | 
			
		||||
 | 
			
		||||
  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
 | 
			
		||||
  // Invert the Clover term
 | 
			
		||||
  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
 | 
			
		||||
  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
 | 
			
		||||
  // TODO: For now this inversion is explictly done on the CPU
 | 
			
		||||
  double t7 = usecond();
 | 
			
		||||
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
 | 
			
		||||
  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
 | 
			
		||||
 | 
			
		||||
  // Fill the remaining clover fields
 | 
			
		||||
  double t8 = usecond();
 | 
			
		||||
@@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 | 
			
		||||
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
 | 
			
		||||
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -91,11 +91,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class cobj,class vobj,class compressor>
 | 
			
		||||
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
 | 
			
		||||
				 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
 | 
			
		||||
				 commVector<cobj *> pointers,
 | 
			
		||||
				 int dimension,int plane,
 | 
			
		||||
				 int cbmask,compressor &compress,int type) __attribute__((noinline));
 | 
			
		||||
 | 
			
		||||
template<class cobj,class vobj,class compressor>
 | 
			
		||||
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
 | 
			
		||||
				 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
 | 
			
		||||
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
 | 
			
		||||
				 const Lattice<vobj> &rhs,
 | 
			
		||||
				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
 | 
			
		||||
				 compressor &compress,int type)
 | 
			
		||||
{
 | 
			
		||||
  assert( (table.size()&0x1)==0);
 | 
			
		||||
@@ -103,14 +106,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
 | 
			
		||||
  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
 | 
			
		||||
 | 
			
		||||
  auto rhs_v = rhs.View(AcceleratorRead);
 | 
			
		||||
  auto rhs_p = &rhs_v[0];
 | 
			
		||||
  auto p0=&pointers[0][0];
 | 
			
		||||
  auto p1=&pointers[1][0];
 | 
			
		||||
  auto tp=&table[0];
 | 
			
		||||
  accelerator_forNB(j, num, vobj::Nsimd(), {
 | 
			
		||||
      compress.CompressExchange(p0,p1, &rhs_v[0], j,
 | 
			
		||||
			      so+tp[2*j  ].second,
 | 
			
		||||
			      so+tp[2*j+1].second,
 | 
			
		||||
			      type);
 | 
			
		||||
      compress.CompressExchange(p0,p1, rhs_p, j,
 | 
			
		||||
				so+tp[2*j  ].second,
 | 
			
		||||
				so+tp[2*j+1].second,
 | 
			
		||||
				type);
 | 
			
		||||
  });
 | 
			
		||||
  rhs_v.ViewClose();
 | 
			
		||||
}
 | 
			
		||||
@@ -257,8 +261,8 @@ public:
 | 
			
		||||
  struct Merge {
 | 
			
		||||
    static constexpr int Nsimd = vobj::Nsimd();
 | 
			
		||||
    cobj * mpointer;
 | 
			
		||||
    Vector<scalar_object *> rpointers;
 | 
			
		||||
    Vector<cobj *> vpointers;
 | 
			
		||||
    //    std::vector<scalar_object *> rpointers;
 | 
			
		||||
    std::vector<cobj *> vpointers;
 | 
			
		||||
    Integer buffer_size;
 | 
			
		||||
    Integer type;
 | 
			
		||||
    Integer partial; // partial dirichlet BCs
 | 
			
		||||
@@ -432,6 +436,7 @@ public:
 | 
			
		||||
					Packets[i].from_rank,Packets[i].do_recv,
 | 
			
		||||
					Packets[i].xbytes,Packets[i].rbytes,i);
 | 
			
		||||
    }
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
@@ -625,7 +630,7 @@ public:
 | 
			
		||||
    d.buffer_size = buffer_size;
 | 
			
		||||
    dv.push_back(d);
 | 
			
		||||
  }
 | 
			
		||||
  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
 | 
			
		||||
  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
 | 
			
		||||
    Merge m;
 | 
			
		||||
    m.partial  = this->partialDirichlet;
 | 
			
		||||
    m.dims     = _grid->_fdimensions;
 | 
			
		||||
@@ -1268,8 +1273,8 @@ public:
 | 
			
		||||
    
 | 
			
		||||
    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
 | 
			
		||||
 | 
			
		||||
    Vector<cobj *> rpointers(maxl);
 | 
			
		||||
    Vector<cobj *> spointers(maxl);
 | 
			
		||||
    std::vector<cobj *> rpointers(maxl);
 | 
			
		||||
    std::vector<cobj *> spointers(maxl);
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    // Work out what to send where
 | 
			
		||||
 
 | 
			
		||||
@@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 | 
			
		||||
     });	   			              \
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n");  theGridAccelerator->wait(); }
 | 
			
		||||
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 | 
			
		||||
 | 
			
		||||
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 | 
			
		||||
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 | 
			
		||||
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 | 
			
		||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 | 
			
		||||
 | 
			
		||||
inline void acceleratorCopySynchronise(void) {  printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
 | 
			
		||||
inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
 | 
			
		||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
 | 
			
		||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 | 
			
		||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										131
									
								
								benchmarks/Benchmark_halo.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								benchmarks/Benchmark_halo.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,131 @@
 | 
			
		||||
 /*************************************************************************************
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_dwf.cc
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#ifdef GRID_CUDA
 | 
			
		||||
#define CUDA_PROFILE
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef CUDA_PROFILE
 | 
			
		||||
#include <cuda_profiler_api.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::Algebra Gmu [] = {
 | 
			
		||||
    Gamma::Algebra::GammaX,
 | 
			
		||||
    Gamma::Algebra::GammaY,
 | 
			
		||||
    Gamma::Algebra::GammaZ,
 | 
			
		||||
    Gamma::Algebra::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  Coordinate latt4= GridDefaultLatt();
 | 
			
		||||
  Coordinate mpi  = GridDefaultMpi();
 | 
			
		||||
  Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
 | 
			
		||||
  GridLogLayout();
 | 
			
		||||
 | 
			
		||||
  int Ls=16;
 | 
			
		||||
  for(int i=0;i<argc;i++)
 | 
			
		||||
    if(std::string(argv[i]) == "-Ls"){
 | 
			
		||||
      std::stringstream ss(argv[i+1]); ss >> Ls;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
 | 
			
		||||
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
 | 
			
		||||
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
 | 
			
		||||
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
 | 
			
		||||
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeFermionF src   (FGrid); random(RNG5,src);
 | 
			
		||||
  RealD N2 = 1.0/::sqrt(norm2(src));
 | 
			
		||||
  src = src*N2;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
 | 
			
		||||
  LatticeGaugeFieldF Umu(UGrid);
 | 
			
		||||
  SU<Nc>::HotConfiguration(RNG4,Umu);
 | 
			
		||||
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
  RealD NN = UGrid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  const int ncall = 500;
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt         "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  {
 | 
			
		||||
    typename DomainWallFermionF::Compressor compressor(0);
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
    Dw.Stencil.HaloExchangeOptGather(src,compressor);
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      Dw.Stencil.HaloExchangeOptGather(src,compressor);
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
 | 
			
		||||
    double bytes=0.0;
 | 
			
		||||
    if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
 | 
			
		||||
    if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
 | 
			
		||||
    if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
 | 
			
		||||
    if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
 | 
			
		||||
    bytes = bytes * Ls * 8.* (24.+12.)* 2.0;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Gather us /call =   "<< (t1-t0)/ncall<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Gather MBs /call =   "<< bytes*ncall/(t1-t0)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
  exit(0);
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										4
									
								
								systems/PVC/benchmarks/run-1tile.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										4
									
								
								systems/PVC/benchmarks/run-1tile.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@@ -6,7 +6,7 @@
 | 
			
		||||
 | 
			
		||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
 | 
			
		||||
export NT=16
 | 
			
		||||
export NT=8
 | 
			
		||||
 | 
			
		||||
export I_MPI_OFFLOAD=1
 | 
			
		||||
export I_MPI_OFFLOAD_TOPOLIB=level_zero
 | 
			
		||||
@@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
 | 
			
		||||
export EnableImplicitScaling=0
 | 
			
		||||
export EnableWalkerPartition=0
 | 
			
		||||
export ZE_AFFINITY_MASK=0.0
 | 
			
		||||
mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
 | 
			
		||||
mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
 | 
			
		||||
 | 
			
		||||
export ZE_AFFINITY_MASK=0
 | 
			
		||||
export I_MPI_OFFLOAD_CELL=device
 | 
			
		||||
 
 | 
			
		||||
@@ -1,13 +1,13 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
##SBATCH -p PVC-SPR-QZEH 
 | 
			
		||||
##SBATCH -p PVC-ICX-QZNW
 | 
			
		||||
 | 
			
		||||
#SBATCH -p QZ1J-ICX-PVC
 | 
			
		||||
 | 
			
		||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 | 
			
		||||
 | 
			
		||||
export NT=16
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# export IGC_EnableLSCFenceUGMBeforeEOT=0
 | 
			
		||||
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
 | 
			
		||||
#export IGC_ShaderDumpEnable=1 
 | 
			
		||||
@@ -19,8 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 | 
			
		||||
export I_MPI_OFFLOAD_CELL=tile
 | 
			
		||||
export EnableImplicitScaling=0
 | 
			
		||||
export EnableWalkerPartition=0
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
 | 
			
		||||
 | 
			
		||||
mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
 | 
			
		||||
for i in 0 
 | 
			
		||||
do
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
 | 
			
		||||
done
 | 
			
		||||
#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
 | 
			
		||||
#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
 | 
			
		||||
 | 
			
		||||
mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -7,8 +7,8 @@ echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
 | 
			
		||||
 | 
			
		||||
if [ $MPI_LOCALRANKID = "0" ] 
 | 
			
		||||
then
 | 
			
		||||
#  ~psteinbr/build_pti/ze_tracer -c $@
 | 
			
		||||
  onetrace --chrome-kernel-timeline $@
 | 
			
		||||
#  ~psteinbr/build_pti/ze_tracer -h $@
 | 
			
		||||
  onetrace --chrome-device-timeline $@
 | 
			
		||||
else
 | 
			
		||||
  $@
 | 
			
		||||
fi
 | 
			
		||||
 
 | 
			
		||||
@@ -2,14 +2,16 @@ INSTALL=/nfs/site/home/azusayax/install
 | 
			
		||||
../../configure \
 | 
			
		||||
	--enable-simd=GPU \
 | 
			
		||||
	--enable-gen-simd-width=64 \
 | 
			
		||||
	--enable-comms=mpi \
 | 
			
		||||
	--enable-comms=mpi-auto \
 | 
			
		||||
	--disable-accelerator-cshift \
 | 
			
		||||
	--disable-gparity \
 | 
			
		||||
	--disable-fermion-reps \
 | 
			
		||||
	--enable-shm=nvlink \
 | 
			
		||||
	--enable-accelerator=sycl \
 | 
			
		||||
	--enable-unified=yes \
 | 
			
		||||
	CXX=mpicxx \
 | 
			
		||||
	--enable-unified=no \
 | 
			
		||||
	MPICXX=mpicxx \
 | 
			
		||||
	CXX=dpcpp \
 | 
			
		||||
	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
 | 
			
		||||
	CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare"
 | 
			
		||||
	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,2 @@
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi
 | 
			
		||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi
 | 
			
		||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,35 +1,12 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    grid` physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_cshift.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 ;
 | 
			
		||||
Gamma::Algebra Gmu [] = {
 | 
			
		||||
  Gamma::Algebra::GammaX,
 | 
			
		||||
  Gamma::Algebra::GammaY,
 | 
			
		||||
  Gamma::Algebra::GammaZ,
 | 
			
		||||
  Gamma::Algebra::GammaT,
 | 
			
		||||
  Gamma::Algebra::Gamma5
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
@@ -49,22 +26,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian RBGRID(&GRID);
 | 
			
		||||
 | 
			
		||||
  LatticeComplexD     one(&GRID);
 | 
			
		||||
  LatticeComplexD      zz(&GRID);
 | 
			
		||||
  LatticeComplexD       C(&GRID);
 | 
			
		||||
  LatticeComplexD  Ctilde(&GRID);
 | 
			
		||||
  LatticeComplexD  Cref  (&GRID);
 | 
			
		||||
  LatticeComplexD  Csav  (&GRID);
 | 
			
		||||
  LatticeComplexD    coor(&GRID);
 | 
			
		||||
 | 
			
		||||
  LatticeSpinMatrixD    S(&GRID);
 | 
			
		||||
  LatticeSpinMatrixD    Stilde(&GRID);
 | 
			
		||||
  
 | 
			
		||||
  Coordinate p({1,3,2,3});
 | 
			
		||||
 | 
			
		||||
  one = ComplexD(1.0,0.0);
 | 
			
		||||
  zz  = ComplexD(0.0,0.0);
 | 
			
		||||
 | 
			
		||||
  ComplexD ci(0.0,1.0);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
@@ -73,7 +36,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeFieldD Umu(&GRID);
 | 
			
		||||
 | 
			
		||||
  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
@@ -81,16 +43,78 @@ int main (int argc, char ** argv)
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
 | 
			
		||||
    LatticeFermionD    src_p(&GRID);
 | 
			
		||||
    LatticeFermionD    tmp(&GRID);
 | 
			
		||||
    LatticeFermionD    ref(&GRID);
 | 
			
		||||
    LatticeFermionD    result(&GRID);
 | 
			
		||||
    
 | 
			
		||||
    RealD mass=0.01;
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
 | 
			
		||||
    
 | 
			
		||||
    Dw.M(src,tmp);
 | 
			
		||||
    Dw.M(src,ref);
 | 
			
		||||
    std::cout << "Norm src "<<norm2(src)<<std::endl;
 | 
			
		||||
    std::cout << "Norm Dw x src "<<norm2(ref)<<std::endl;
 | 
			
		||||
    {
 | 
			
		||||
      FFT theFFT(&GRID);
 | 
			
		||||
 | 
			
		||||
      ////////////////
 | 
			
		||||
      // operator in Fourier space
 | 
			
		||||
      ////////////////
 | 
			
		||||
      tmp =ref;
 | 
			
		||||
      theFFT.FFT_all_dim(result,tmp,FFT::forward);
 | 
			
		||||
      std::cout<<"FFT[ Dw x src ]  "<< norm2(result)<<std::endl;    
 | 
			
		||||
 | 
			
		||||
      tmp = src;
 | 
			
		||||
      theFFT.FFT_all_dim(src_p,tmp,FFT::forward);
 | 
			
		||||
      std::cout<<"FFT[ src      ]  "<< norm2(src_p)<<std::endl;
 | 
			
		||||
      
 | 
			
		||||
      /////////////////////////////////////////////////////////////////
 | 
			
		||||
      // work out the predicted FT from Fourier
 | 
			
		||||
      /////////////////////////////////////////////////////////////////
 | 
			
		||||
      auto FGrid = &GRID;
 | 
			
		||||
      LatticeFermionD    Kinetic(FGrid); Kinetic = Zero();
 | 
			
		||||
      LatticeComplexD    kmu(FGrid); 
 | 
			
		||||
      LatticeInteger     scoor(FGrid); 
 | 
			
		||||
      LatticeComplexD    sk (FGrid); sk = Zero();
 | 
			
		||||
      LatticeComplexD    sk2(FGrid); sk2= Zero();
 | 
			
		||||
      LatticeComplexD    W(FGrid); W= Zero();
 | 
			
		||||
      LatticeComplexD    one(FGrid); one =ComplexD(1.0,0.0);
 | 
			
		||||
      ComplexD ci(0.0,1.0);
 | 
			
		||||
    
 | 
			
		||||
      for(int mu=0;mu<Nd;mu++) {
 | 
			
		||||
	
 | 
			
		||||
	RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 | 
			
		||||
 | 
			
		||||
	LatticeCoordinate(kmu,mu);
 | 
			
		||||
 | 
			
		||||
	kmu = TwoPiL * kmu;
 | 
			
		||||
      
 | 
			
		||||
	sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
 | 
			
		||||
	sk  = sk  +     sin(kmu)    *sin(kmu); 
 | 
			
		||||
      
 | 
			
		||||
	// -1/2 Dw ->  1/2 gmu (eip - emip) = i sinp gmu
 | 
			
		||||
	Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p);
 | 
			
		||||
	
 | 
			
		||||
      }
 | 
			
		||||
    
 | 
			
		||||
      W = mass + sk2; 
 | 
			
		||||
      Kinetic = Kinetic + W * src_p;
 | 
			
		||||
    
 | 
			
		||||
      std::cout<<"Momentum space src         "<< norm2(src_p)<<std::endl;
 | 
			
		||||
      std::cout<<"Momentum space Dw x src    "<< norm2(Kinetic)<<std::endl;
 | 
			
		||||
      std::cout<<"FT[Coordinate space Dw]    "<< norm2(result)<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
      result = result - Kinetic;
 | 
			
		||||
      std::cout<<"diff "<< norm2(result)<<std::endl;
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    std::cout << " =======================================" <<std::endl;
 | 
			
		||||
    std::cout << " Checking FourierFreePropagator x Dw = 1" <<std::endl;
 | 
			
		||||
    std::cout << " =======================================" <<std::endl;
 | 
			
		||||
    std::cout << "Dw src = " <<norm2(src)<<std::endl;
 | 
			
		||||
    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
 | 
			
		||||
    Dw.M(src,tmp);
 | 
			
		||||
 | 
			
		||||
    Dw.FreePropagator(tmp,ref,mass);
 | 
			
		||||
 | 
			
		||||
@@ -122,7 +146,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
    ferm()(0)(0) = ComplexD(1.0);
 | 
			
		||||
    pokeSite(ferm,src,point);
 | 
			
		||||
 | 
			
		||||
    RealD mass=0.01;
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
 | 
			
		||||
    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
 | 
			
		||||
 | 
			
		||||
    // Momentum space prop
 | 
			
		||||
@@ -155,6 +180,65 @@ int main (int argc, char ** argv)
 | 
			
		||||
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  //Gauge invariance test
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    std::cout<<"****************************************"<<std::endl;
 | 
			
		||||
    std::cout << "Gauge invariance test \n";
 | 
			
		||||
    std::cout<<"****************************************"<<std::endl;
 | 
			
		||||
    LatticeGaugeField     U_GT(&GRID); // Gauge transformed field
 | 
			
		||||
    LatticeColourMatrix   g(&GRID);    // local Gauge xform matrix
 | 
			
		||||
    U_GT = Umu;
 | 
			
		||||
    // Make a random xform to teh gauge field
 | 
			
		||||
    SU<Nc>::RandomGaugeTransform(pRNG,U_GT,g); // Unit gauge
 | 
			
		||||
 | 
			
		||||
    LatticeFermionD    src(&GRID);
 | 
			
		||||
    LatticeFermionD    tmp(&GRID);
 | 
			
		||||
    LatticeFermionD    ref(&GRID);
 | 
			
		||||
    LatticeFermionD    diff(&GRID);
 | 
			
		||||
 | 
			
		||||
    // could loop over colors
 | 
			
		||||
    src=Zero();
 | 
			
		||||
    Coordinate point(4,0); // 0,0,0,0
 | 
			
		||||
    SpinColourVectorD ferm;
 | 
			
		||||
    ferm=Zero();
 | 
			
		||||
    ferm()(0)(0) = ComplexD(1.0);
 | 
			
		||||
    pokeSite(ferm,src,point);
 | 
			
		||||
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
    WilsonFermionD Dw(U_GT,GRID,RBGRID,mass);
 | 
			
		||||
 | 
			
		||||
    // Momentum space prop
 | 
			
		||||
    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
 | 
			
		||||
    Dw.FreePropagator(src,ref,mass) ;
 | 
			
		||||
 | 
			
		||||
    Gamma G5(Gamma::Algebra::Gamma5);
 | 
			
		||||
 | 
			
		||||
    LatticeFermionD    result(&GRID); 
 | 
			
		||||
    const int sdir=0;
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Conjugate gradient on normal equations system
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
 | 
			
		||||
    Dw.Mdag(src,tmp);
 | 
			
		||||
    src=tmp;
 | 
			
		||||
    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
 | 
			
		||||
    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
 | 
			
		||||
    CG(HermOp,src,result);
 | 
			
		||||
    
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    std::cout << " Taking difference" <<std::endl;
 | 
			
		||||
    std::cout << "Dw result "<<norm2(result)<<std::endl;
 | 
			
		||||
    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    diff = ref - result;
 | 
			
		||||
    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
 | 
			
		||||
 | 
			
		||||
    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										110
									
								
								tests/core/Test_memory_manager.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										110
									
								
								tests/core/Test_memory_manager.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,110 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_memory_manager.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2022
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pboyle@bnl.gov>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
void  MemoryTest(GridCartesian         * FGrid,int N);
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
 | 
			
		||||
  int N=100;
 | 
			
		||||
  for(int i=0;i<N;i++){
 | 
			
		||||
    std::cout << "============================"<<std::endl;
 | 
			
		||||
    std::cout << "Epoch "<<i<<"/"<<N<<std::endl;
 | 
			
		||||
    std::cout << "============================"<<std::endl;
 | 
			
		||||
    MemoryTest(UGrid,256);
 | 
			
		||||
    MemoryManager::Print();
 | 
			
		||||
    AUDIT();
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void  MemoryTest(GridCartesian         * FGrid, int N)
 | 
			
		||||
{
 | 
			
		||||
  LatticeComplexD zero(FGrid); zero=Zero();
 | 
			
		||||
  std::vector<LatticeComplexD> A(N,zero);//FGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<ComplexD> B(N,ComplexD(0.0)); // Update sequentially on host
 | 
			
		||||
 | 
			
		||||
  for(int v=0;v<N;v++) A[v] = Zero();
 | 
			
		||||
 | 
			
		||||
  uint64_t counter = 0;
 | 
			
		||||
  for(int epoch = 0;epoch<10000;epoch++){
 | 
			
		||||
 | 
			
		||||
    int v  = random() %N; // Which vec
 | 
			
		||||
    int w  = random() %2; // Write or read
 | 
			
		||||
    int e  = random() %3; // expression or for loop
 | 
			
		||||
    int dev= random() %2; // On device?
 | 
			
		||||
    //    int e=1;
 | 
			
		||||
    ComplexD zc = counter++;
 | 
			
		||||
    
 | 
			
		||||
    if ( w ) {
 | 
			
		||||
      B[v] = B[v] + zc;
 | 
			
		||||
      if ( e == 0 ) {
 | 
			
		||||
	A[v] = A[v] + zc - A[v] + A[v];
 | 
			
		||||
      } else {
 | 
			
		||||
	if ( dev ) { 
 | 
			
		||||
	  autoView(A_v,A[v],AcceleratorWrite);
 | 
			
		||||
	  accelerator_for(ss,FGrid->oSites(),1,{
 | 
			
		||||
	    A_v[ss] = A_v[ss] + zc;
 | 
			
		||||
	    });
 | 
			
		||||
	} else {
 | 
			
		||||
	  autoView(A_v,A[v],CpuWrite);
 | 
			
		||||
	  thread_for(ss,FGrid->oSites(),{
 | 
			
		||||
	      A_v[ss] = A_v[ss] + zc;
 | 
			
		||||
	    });
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      if ( e == 0 ) {
 | 
			
		||||
	A[v] = A[v] + A[v] - A[v];
 | 
			
		||||
      } else { 
 | 
			
		||||
	if ( dev ) { 
 | 
			
		||||
	  autoView(A_v,A[v],AcceleratorRead);
 | 
			
		||||
	  accelerator_for(ss,FGrid->oSites(),1,{
 | 
			
		||||
	      assert(B[v]==A_v[ss]()()().getlane(0));
 | 
			
		||||
	    });
 | 
			
		||||
	  //	std::cout << "["<<v<<"] checked on GPU"<<B[v]<<std::endl;
 | 
			
		||||
	} else {
 | 
			
		||||
	  autoView(A_v,A[v],CpuRead);
 | 
			
		||||
	  thread_for(ss,FGrid->oSites(),{
 | 
			
		||||
	      assert(B[v]==A_v[ss]()()().getlane(0));
 | 
			
		||||
	    });
 | 
			
		||||
	  //	std::cout << "["<<v<<"] checked on CPU"<<B[v]<<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
      }    
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user