mirror of https://github.com/paboyle/Grid.git synced 2025-06-24 18:52:02 +01:00

Compare commits


2 Commits

110 changed files with 1841 additions and 5665 deletions

View File

@ -44,22 +44,14 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
#endif
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC

View File

@ -16,7 +16,6 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>

View File

@ -14,11 +14,7 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")

View File

@ -113,43 +113,7 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preprocessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigenvector batch size 1 and source batch size equal to the number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};
};
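For reference, the multi-source operator() above implements the standard deflation formula guess_j = sum_i e_i <e_i, src_j> / lambda_i, applied in the coarse space between blockProject and blockPromote. A minimal standalone sketch of that inner kernel, using plain std::vector/std::complex instead of Grid's CoarseField, innerProduct and axpy (all names here are illustrative only):

#include <complex>
#include <vector>

using cvec = std::vector<std::complex<double>>;

// Deflated guess for one source: guess = sum_i evec[i] * <evec[i],src> / eval[i]
cvec deflate(const std::vector<cvec> &evec, const std::vector<double> &eval, const cvec &src)
{
  cvec guess(src.size(), std::complex<double>(0.0, 0.0));
  for (size_t i = 0; i < evec.size(); i++) {
    std::complex<double> ip(0.0, 0.0);
    for (size_t k = 0; k < src.size(); k++) ip += std::conj(evec[i][k]) * src[k];      // <e_i, src>
    for (size_t k = 0; k < src.size(); k++) guess[k] += (ip / eval[i]) * evec[i][k];   // axpy
  }
  return guess;
}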

View File

@ -113,6 +113,11 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceAccesses;
static uint64_t HostAccesses;
static uint64_t DeviceAccessBytes;
static uint64_t HostAccessBytes;
private:
#ifndef GRID_UVM
@ -152,6 +157,7 @@ private:
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUinsertback(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table

View File

@ -23,6 +23,11 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceAccesses;
uint64_t MemoryManager::HostAccesses;
uint64_t MemoryManager::DeviceAccessBytes;
uint64_t MemoryManager::HostAccessBytes;
////////////////////////////////////
// Priority ordering for unlocked entries
@ -86,6 +91,14 @@ void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUinsertback(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
@ -129,6 +142,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
@ -231,6 +245,9 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
EntryCreate(CpuPtr,bytes,mode,hint);
}
DeviceAccesses++;
DeviceAccessBytes+=bytes;
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
@ -349,6 +366,10 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
assert(AccCache.accLock==0);
AccCache.cpuLock--;
if(AccCache.cpuLock==0) {
LRUinsertback(AccCache);
}
}
/*
* Action State StateNext Flush Clone
@ -371,6 +392,9 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
EntryCreate(CpuPtr,bytes,mode,transient);
}
HostAccesses++;
HostAccessBytes+=bytes;
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
@ -416,6 +440,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
AccCache.transient= transient? EvictNext : 0;
// If view is opened on host remove from LRU
// Host close says evict next from device
if(AccCache.LRU_valid==1){
LRUremove(AccCache);
}
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
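The LRUinsert/LRUinsertback/LRUremove changes above rely on each cache entry carrying an iterator handle into the LRU std::list, so an entry can be unlinked in O(1) when a host view opens and re-appended when it closes. A minimal standalone sketch of that idiom (the Entry struct and free functions are illustrative, not the MemoryManager's actual types):

#include <cstdint>
#include <list>

struct Entry {
  uint64_t CpuPtr = 0;
  std::list<uint64_t>::iterator LRU_entry;  // handle into the LRU list
  bool LRU_valid = false;
};

static std::list<uint64_t> LRU;

void LRUinsert(Entry &e)     { LRU.push_front(e.CpuPtr); e.LRU_entry = LRU.begin(); e.LRU_valid = true; }
void LRUinsertback(Entry &e) { LRU.push_back(e.CpuPtr);  e.LRU_entry = --LRU.end(); e.LRU_valid = true; }
void LRUremove(Entry &e)     { if (e.LRU_valid) { LRU.erase(e.LRU_entry); e.LRU_valid = false; } }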

View File

@ -12,6 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceAccesses;
uint64_t MemoryManager::HostAccesses;
uint64_t MemoryManager::DeviceAccessBytes;
uint64_t MemoryManager::HostAccessBytes;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };

View File

@ -53,11 +53,10 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
int _processor; // linear processor rank
unsigned long _ndimension;
Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
@ -98,9 +97,8 @@ public:
int BossRank(void) ;
int ThisRank(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ShmGrid(void) { return _shm_processors; } ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
@ -144,16 +142,16 @@ public:
int bytes);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,int do_recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,int do_recv,
int recv_from_rank,
int bytes,int dir);

View File

@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
shm_processors [pad+d]=parent._shm_processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@ -337,22 +335,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox,
int dest,
void *recv,
int from, int dor,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
int dest,
void *recv,
int from,int dor,
int from,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
@ -372,33 +370,30 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0;
int tag;
if ( dox ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if (dor) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make an OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
list.resize(0);
}
return off_node_bytes;
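For context, the branches above post ordinary non-blocking MPI messages whenever the partner rank is off-node (gdest/gfrom == MPI_UNDEFINED) or Stencil_force_mpi is set, and otherwise copy straight into the neighbour's shared-memory buffer. A stripped-down, self-contained sketch of the MPI leg alone, with made-up ranks and buffer size purely for illustration:

#include <mpi.h>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int bytes = 1024;
  std::vector<char> xmit(bytes, (char)rank), recv(bytes);
  int dest = (rank + 1) % size;           // neighbour we send to
  int from = (rank - 1 + size) % size;    // neighbour we receive from

  MPI_Request reqs[2];
  MPI_Irecv(recv.data(), bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(xmit.data(), bytes, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[1]);
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  MPI_Finalize();
  return 0;
}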

View File

@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
_shm_processors = Coordinate(processors.size(),1);
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
@ -113,18 +111,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
int xmit_to_rank,
void *recv,
int recv_from_rank,int dor,
int recv_from_rank,
int bytes, int dir)
{
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
int xmit_to_rank,
void *recv,
int recv_from_rank,int dor,
int recv_from_rank,
int bytes, int dir)
{
return 2.0*bytes;

View File

@ -93,10 +93,9 @@ public:
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
// Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world

View File

@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
static inline int divides(int a,int b)
{
@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@ -294,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM = ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -342,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
@ -354,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////

View File

@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -46,4 +46,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@ -1,55 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -142,15 +142,6 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@ -168,22 +159,6 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
@ -37,13 +37,13 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@ -53,12 +53,12 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
return 0;
exit(EXIT_FAILURE);
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
template <class sobj, class Iterator>
@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@ -207,9 +207,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
@ -220,54 +218,6 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
auto result = buffer_v[0];
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -280,13 +230,6 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
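The removed sumD_gpu_large/sum_large path above avoided the shared-memory limit by copying one vector word of the object at a time into a scratch buffer and reducing each word separately. A self-contained CPU analogue of that word-split idea (the struct and its size are invented for illustration):

#include <cstddef>
#include <vector>

struct BigObj { double w[16]; };   // stand-in for a per-site object too large for one pass

// Reduce one "word" (component) at a time so the per-pass working set stays small.
BigObj sum_by_words(const std::vector<BigObj> &data)
{
  BigObj ret{};
  const int words = sizeof(BigObj) / sizeof(double);
  for (int w = 0; w < words; w++) {
    double acc = 0.0;
    for (std::size_t ss = 0; ss < data.size(); ss++) acc += data[ss].w[w];
    ret.w[w] = acc;
  }
  return ret;
}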

View File

@ -69,7 +69,6 @@ GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
@ -80,7 +79,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogPerformance.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
@ -89,8 +87,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}

View File

@ -182,7 +182,6 @@ extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern Colours GridLogColours;
std::string demangle(const char* name) ;

View File

@ -31,7 +31,6 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@ -655,8 +654,7 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
ildgfmt.field = std::string("su3gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@ -873,8 +871,7 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
assert ( ildgFormat_.field == std::string("su3gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@ -882,7 +879,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);

View File

@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,24 +203,20 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
assert( Nc < 4 && Nc > 1 ) ;
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disingenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
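As the comments indicate, reconstruct3 rebuilds the third row of each SU(3) link as the complex conjugate of the cross product of the first two rows, row2 = conj(row0 x row1), which is what the two-row (Nc-1 x Nc) storage format relies on. A standalone numeric illustration for a single 3x3 matrix held in plain arrays rather than Grid tensor types:

#include <complex>

using C = std::complex<double>;

// Given the first two rows of an SU(3) matrix, rebuild the third: u2 = conj(u0 x u1)
void reconstruct_third_row(const C u0[3], const C u1[3], C u2[3])
{
  u2[0] = std::conj(u0[1]*u1[2] - u0[2]*u1[1]); // x = yz - zy
  u2[1] = std::conj(u0[2]*u1[0] - u0[0]*u1[2]); // y = zx - xz
  u2[2] = std::conj(u0[0]*u1[1] - u0[1]*u1[0]); // z = xy - yx
}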
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -282,6 +278,7 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@ -320,8 +317,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@ -333,8 +330,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@ -9,7 +9,6 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -31,8 +30,6 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@ -148,17 +145,15 @@ public:
std::string format(header.floating_point);
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -169,7 +164,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -214,29 +209,27 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
std::string ens_label = std::string("DWF"))
{
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
writeConfiguration(Umu,file,0,1,ens_label);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
std::string ens_label = std::string("DWF"))
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@ -250,14 +243,10 @@ public:
uint64_t offset;
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@ -265,15 +254,8 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@ -305,7 +287,8 @@ public:
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
@ -345,7 +328,7 @@ public:
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@ -16,12 +16,8 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced" warning
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced" warning
#endif
#endif
#include "pugixml.h"

View File

@ -40,29 +40,6 @@ class Action
public:
bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
RealD refresh_us;
void reset_timer(void) {
deriv_us = S_us = refresh_us = 0.0;
deriv_num=0;
deriv_norm_sum = deriv_max_sum=0.0;
}
void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
// Heatbath?
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
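The deriv/S/refresh timers removed above all use the same accumulation trick: subtract the clock at start and add it back at stop, so after many calls the variable holds the total elapsed microseconds. A minimal illustration with std::chrono standing in for Grid's usecond():

#include <chrono>

static double now_us() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

double deriv_us = 0.0;
void deriv_timer_start(void) { deriv_us -= now_us(); }  // negative start time
void deriv_timer_stop (void) { deriv_us += now_us(); }  // adding the stop time accumulates the interval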

View File

@ -37,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams);
#include <Grid/qcd/action/filters/MomentumFilter.h>
#include <Grid/qcd/action/filters/DirichletFilter.h>
#include <Grid/qcd/action/filters/DDHMCFilter.h>
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////

View File

@ -37,32 +37,24 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams {
Coordinate twists;
Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0), dirichlet(Nd, 0) {};
GparityWilsonImplParams() : twists(Nd, 0) {};
};
struct WilsonImplParams {
bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs
AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() {
dirichlet.resize(Nd,0);
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
dirichlet.resize(Nd,0);
}
};
struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
StaggeredImplParams()
{
dirichlet.resize(Nd,0);
};
StaggeredImplParams() {};
};
struct OneFlavourRationalParams : Serializable {

View File

@ -68,16 +68,9 @@ public:
///////////////////////////////////////////////////////////////
// Support for MADWF tricks
///////////////////////////////////////////////////////////////
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; };
RealD Mass(void) { return mass; };
void SetMass(RealD _mass) {
mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void SetMass(RealD _mass_plus, RealD _mass_minus) {
mass_plus=_mass_plus;
mass_minus=_mass_minus;
mass=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void P(const FermionField &psi, FermionField &chi);
@ -115,7 +108,7 @@ public:
void MeooeDag5D (const FermionField &in, FermionField &out);
// protected:
RealD mass_plus, mass_minus;
RealD mass;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;

View File

@ -1,435 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
////////////////////////////////////////////
// Standard Clover
// (4+m0) + csw * clover_term
// Exp Clover
// (4+m0) * exp(csw/(4+m0) clover_term)
// = (4+m0) + csw * clover_term + ...
////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////
// Generic Standard Clover
//////////////////////////////////
template<class Impl>
class CloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
GridBase *grid = CloverTerm.Grid();
CloverTerm += diag_mass;
int lvol = grid->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
EigenInvCloverOp = EigenCloverOp.inverse();
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Generic Exp Clover
//////////////////////////////////
template<class Impl>
class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef WilsonCloverHelpers<Impl> Helpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
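The Taylor/Horner evaluation above (cn[i] = cn[i-1]/i, then ExpClover = ExpClover*Clover + cn[i] from the highest order downwards) is the standard nested evaluation of the truncated exponential series, with getNMAX choosing the truncation order from a remainder bound. A scalar, self-contained version of the same scheme, purely for illustration:

#include <cmath>
#include <vector>

// Truncation order for the Taylor series of exp, mirroring getNMAX above.
int get_nmax(double prec, double R)
{
  int NMAX = 1;
  double cond = R * R / 2.;              // size of the first dropped term
  while (cond * std::exp(R) > prec) {
    NMAX++;
    cond *= R / (double)(NMAX + 1);
  }
  return NMAX;
}

// Horner evaluation: exp(x) ~ c0 + x*(c1 + x*(c2 + ...))
double exp_taylor(double x, double prec)
{
  int NMAX = get_nmax(prec, std::fabs(x));
  std::vector<double> cn(NMAX + 1);
  cn[0] = 1.0;
  for (int i = 1; i <= NMAX; i++) cn[i] = cn[i - 1] / (double)i;   // 1/i!
  double r = cn[NMAX];
  for (int i = NMAX - 1; i >= 0; i--) r = r * x + cn[i];
  return r;
}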
//////////////////////////////////
// Compact Standard Clover
//////////////////////////////////
template<class Impl>
class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
Clover += diag_mass;
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
CloverTriangleField& Triangle,
RealD csw_t, RealD diag_mass) {
// Do nothing
}
// TODO: implement Cmunu for better performance with the compact layout, but don't do it
// here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Compact Exp Clover
//////////////////////////////////
template<class Impl>
class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
// do nothing!
// mass term is multiplied into exp(Clover) below
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
typedef iMatrix<ComplexD,6> mat;
RealD qn[6];
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
mat A2, A3, A4, A5;
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
trA2 = toReal( trace(A2) );
trA3 = toReal( trace(A3) );
trA4 = toReal( trace(A4));
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){
cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation
conformable(Diagonal, Triangle);
long lsites = grid->lSites();
{
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
autoView(diagonal_v, Diagonal, CpuRead);
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
mat srcCloverOpUL(0.0); // upper left block
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
NAMESPACE_END(Grid);

View File

@ -1,241 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
Copyright (C) 2020 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less
// - Here: 2*12 + 42 = 66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
// this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
// 0 1 2 3 4
// 5 6 7 8
// 9 10 11 = upper right triangle indices
// 12 13
// 14
// 0
// 1
// 2
// 3 = diagonal indices
// 4
// 5
// 0
// 1 5
// 2 6 9 = lower left triangle indices
// 3 7 10 12
// 4 8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
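A quick standalone check of the row-major upper-triangle numbering drawn above; the closed-form index used here is my own illustration and not necessarily how CompactWilsonCloverHelpers computes it:

#include <cstdio>

// Index of element (i,j), i<j, in a row-major upper triangle of an NxN matrix
// (diagonal stored separately), matching the 6x6 picture above.
int triangle_index(int i, int j, int N = 6)
{
  return i * (N - 1) - i * (i - 1) / 2 + (j - i - 1);
}

int main()
{
  for (int i = 0; i < 6; i++) {
    for (int j = i + 1; j < 6; j++) printf("%3d", triangle_index(i, j));
    printf("\n");   // prints 0..4 / 5..8 / 9..11 / 12 13 / 14
  }
  return 0;
}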
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
bool open_boundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@ -53,7 +53,6 @@ NAMESPACE_CHECK(Wilson);
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -138,52 +137,21 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
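For orientation, a hedged usage sketch of the compact clover alias introduced above, following Grid's usual test-program boilerplate; the lattice size, RNG seeds and the mass/csw/cF values are illustrative placeholders, not taken from this commit:
#include <Grid/Grid.h>
using namespace Grid;
int main(int argc, char** argv) {
  Grid_init(&argc, &argv);
  // Default 4d grid and its red-black (checkerboarded) companion.
  GridCartesian* UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplexD::Nsimd()),
                                   GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // Random gauge field and source (illustrative values only).
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  LatticeGaugeFieldD Umu(UGrid);
  SU<Nc>::HotConfiguration(RNG4, Umu);
  LatticeFermionD src(UGrid), res(UGrid);
  random(RNG4, src);
  // Compact clover operator: mass, csw_r, csw_t, cF values are placeholders.
  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0;
  CompactWilsonCloverFermionD Dcc(Umu, *UGrid, *UrbGrid, mass, csw_r, csw_t, cF);
  Dcc.M(src, res);   // full operator: hopping term plus compact clover term
  Grid_finalize();
  return 0;
}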

View File

@ -49,8 +49,6 @@ public:
virtual FermionField &tmp(void) = 0;
virtual void DirichletBlock(Coordinate & _Block) { assert(0); };
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

View File

@ -4,11 +4,10 @@
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017 - 2022
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,9 +29,7 @@
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@ -52,16 +49,19 @@ NAMESPACE_BEGIN(Grid);
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template<class Impl, class CloverHelpers>
class WilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
public:
typedef WilsonFermion<Impl> WilsonBase;
virtual int ConstEE(void) { return 0; };
virtual void Instantiatable(void){};
@ -72,7 +72,42 @@ public:
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams());
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
virtual void M(const FermionField &in, FermionField &out);
virtual void Mdag(const FermionField &in, FermionField &out);
@ -89,21 +124,250 @@ public:
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
public:
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
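// The double loop below visits the off-diagonal (mu,nu) pairs row by row,
// i.e. (0,1),(0,2),(0,3),(1,0),(1,2),..., so sigma[count] is sigma_{mu nu};
// the Minus* entries supply sigma_{nu mu} = -sigma_{mu nu}.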
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
protected:
// here we fix the theory to 4 dimensions; make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverField CloverTerm, CloverTermInv; // Clover term
CloverField CloverTermEven, CloverTermOdd; // Clover term EO
CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
};
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
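// (In this chiral basis sigma_{mu nu} is block-diagonal in spin, which is why each
// fillClover* below only touches entries inside the two 2x2 spin blocks.)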
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
T_v[i]()(2, 3) = -F_v[i]()();
T_v[i]()(3, 2) = F_v[i]()();
});
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
T_v[i]()(2, 3) = (F_v[i]()());
T_v[i]()(3, 2) = -(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
};
NAMESPACE_END(Grid);

View File

@ -1,763 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Helper routines that implement common clover functionality
NAMESPACE_BEGIN(Grid);
template<class Impl> class WilsonCloverHelpers {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
static CloverField fillCloverYZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
});
return T;
}
static CloverField fillCloverXY(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverYT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
});
return T;
}
static CloverField fillCloverZT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
template<class _Spinor>
static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
auto CC = coalescedRead(C);
mult(&phi, &CC, &chi);
}
template<class _SpinorField>
inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
const int Nsimd = SiteSpinor::Nsimd();
autoView(out_v, out, AcceleratorWrite);
autoView(phi_v, phi, AcceleratorRead);
autoView(C_v, C, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multClover(tmp,C_v[sss],phi_v(sss));
coalescedWrite(out_v[sss],tmp);
});
}
};
////////////////////////////////////////////////////////
template<class Impl> class CompactWilsonCloverHelpers {
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
#if 0
static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#else
template<typename vobj>
static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#endif
static accelerator_inline int triangle_index(int i, int j) {
if(i == j)
return 0;
else if(i < j)
return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
else // i > j
return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
}
static void MooeeKernel_gpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(in_v, in, AcceleratorRead);
autoView(out_v, out, AcceleratorWrite);
typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
const uint64_t NN = Nsite * Ls;
accelerator_for(ss, NN, Simd::Nsimd(), {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v(sF);
auto diagonal_t = diagonal_v(sU);
auto triangle_t = triangle_v(sU);
for(int block=0; block<Nhs; block++) {
int s_start = block*Nhs;
for(int i=0; i<Nred; i++) {
int si = s_start + i/Nc, ci = i%Nc;
res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
for(int j=0; j<Nred; j++) {
if (j == i) continue;
int sj = s_start + j/Nc, cj = j%Nc;
res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
};
};
};
coalescedWrite(out_v[sF], res);
});
}
static void MooeeKernel_cpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(in_v, in, CpuRead);
autoView(out_v, out, CpuWrite);
typedef SiteSpinor CalcSpinor;
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
#define PREFETCH_CLOVER(BASE) { \
uint64_t base; \
int pf_dist_L1 = 1; \
int pf_dist_L2 = -5; /* -> penalty -> disable */ \
\
if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
} \
\
if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
} \
}
// TODO: Implement/generalize this for other architectures
// I played around a bit on KNL (see below) but it didn't bring any benefit
// #elif defined(AVX512)
// #define PREFETCH_CLOVER(BASE) { \
// uint64_t base; \
// int pf_dist_L1 = 1; \
// int pf_dist_L2 = +4; \
// \
// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \
// } \
// \
// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \
// } \
// }
#else
#define PREFETCH_CLOVER(BASE)
#endif
const uint64_t NN = Nsite * Ls;
thread_for(ss, NN, {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v[sF];
auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
auto triangle_t = triangle_v[sU];
// upper half
PREFETCH_CLOVER(0);
auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
auto in_cc_1_0 = conjugate(in_t()(1)(0));
auto in_cc_1_1 = conjugate(in_t()(1)(1));
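// Hermiticity trick: for output row i the j<i contributions are
// conj(triangle(j,i)) * in(j); they are first accumulated as
// triangle(j,i) * conj(in(j)) and the partial sum is conjugated once,
// so each input component needs only the single conjugate taken above.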
res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
+ triangle_t()(0)( 0) * in_t()(0)(1)
+ triangle_t()(0)( 1) * in_t()(0)(2)
+ triangle_t()(0)( 2) * in_t()(1)(0)
+ triangle_t()(0)( 3) * in_t()(1)(1)
+ triangle_t()(0)( 4) * in_t()(1)(2);
res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
+ triangle_t()(0)( 5) * in_t()(0)(2)
+ triangle_t()(0)( 6) * in_t()(1)(0)
+ triangle_t()(0)( 7) * in_t()(1)(1)
+ triangle_t()(0)( 8) * in_t()(1)(2)
+ conjugate( res()(0)( 1));
res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
+ triangle_t()(0)( 5) * in_cc_0_1;
res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
+ triangle_t()(0)( 9) * in_t()(1)(0)
+ triangle_t()(0)(10) * in_t()(1)(1)
+ triangle_t()(0)(11) * in_t()(1)(2)
+ conjugate( res()(0)( 2));
res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
+ triangle_t()(0)( 6) * in_cc_0_1
+ triangle_t()(0)( 9) * in_cc_0_2;
res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
+ triangle_t()(0)(12) * in_t()(1)(1)
+ triangle_t()(0)(13) * in_t()(1)(2)
+ conjugate( res()(1)( 0));
res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
+ triangle_t()(0)( 7) * in_cc_0_1
+ triangle_t()(0)(10) * in_cc_0_2
+ triangle_t()(0)(12) * in_cc_1_0;
res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
+ triangle_t()(0)(14) * in_t()(1)(2)
+ conjugate( res()(1)( 1));
res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
+ triangle_t()(0)( 8) * in_cc_0_1
+ triangle_t()(0)(11) * in_cc_0_2
+ triangle_t()(0)(13) * in_cc_1_0
+ triangle_t()(0)(14) * in_cc_1_1;
res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
+ conjugate( res()(1)( 2));
vstream(out_v[sF]()(0)(0), res()(0)(0));
vstream(out_v[sF]()(0)(1), res()(0)(1));
vstream(out_v[sF]()(0)(2), res()(0)(2));
vstream(out_v[sF]()(1)(0), res()(1)(0));
vstream(out_v[sF]()(1)(1), res()(1)(1));
vstream(out_v[sF]()(1)(2), res()(1)(2));
// lower half
PREFETCH_CLOVER(1);
auto in_cc_2_0 = conjugate(in_t()(2)(0));
auto in_cc_2_1 = conjugate(in_t()(2)(1));
auto in_cc_2_2 = conjugate(in_t()(2)(2));
auto in_cc_3_0 = conjugate(in_t()(3)(0));
auto in_cc_3_1 = conjugate(in_t()(3)(1));
res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
+ triangle_t()(1)( 0) * in_t()(2)(1)
+ triangle_t()(1)( 1) * in_t()(2)(2)
+ triangle_t()(1)( 2) * in_t()(3)(0)
+ triangle_t()(1)( 3) * in_t()(3)(1)
+ triangle_t()(1)( 4) * in_t()(3)(2);
res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
+ triangle_t()(1)( 5) * in_t()(2)(2)
+ triangle_t()(1)( 6) * in_t()(3)(0)
+ triangle_t()(1)( 7) * in_t()(3)(1)
+ triangle_t()(1)( 8) * in_t()(3)(2)
+ conjugate( res()(2)( 1));
res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
+ triangle_t()(1)( 5) * in_cc_2_1;
res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
+ triangle_t()(1)( 9) * in_t()(3)(0)
+ triangle_t()(1)(10) * in_t()(3)(1)
+ triangle_t()(1)(11) * in_t()(3)(2)
+ conjugate( res()(2)( 2));
res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
+ triangle_t()(1)( 6) * in_cc_2_1
+ triangle_t()(1)( 9) * in_cc_2_2;
res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
+ triangle_t()(1)(12) * in_t()(3)(1)
+ triangle_t()(1)(13) * in_t()(3)(2)
+ conjugate( res()(3)( 0));
res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
+ triangle_t()(1)( 7) * in_cc_2_1
+ triangle_t()(1)(10) * in_cc_2_2
+ triangle_t()(1)(12) * in_cc_3_0;
res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
+ triangle_t()(1)(14) * in_t()(3)(2)
+ conjugate( res()(3)( 1));
res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
+ triangle_t()(1)( 8) * in_cc_2_1
+ triangle_t()(1)(11) * in_cc_2_2
+ triangle_t()(1)(13) * in_cc_3_0
+ triangle_t()(1)(14) * in_cc_3_1;
res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
+ conjugate( res()(3)( 2));
vstream(out_v[sF]()(2)(0), res()(2)(0));
vstream(out_v[sF]()(2)(1), res()(2)(1));
vstream(out_v[sF]()(2)(2), res()(2)(2));
vstream(out_v[sF]()(3)(0), res()(3)(0));
vstream(out_v[sF]()(3)(1), res()(3)(1));
vstream(out_v[sF]()(3)(2), res()(3)(2));
});
}
static void MooeeKernel(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
#else
MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
#endif
}
static void Invert(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv) {
conformable(diagonal, diagonalInv);
conformable(triangle, triangleInv);
conformable(diagonal, triangle);
diagonalInv.Checkerboard() = diagonal.Checkerboard();
triangleInv.Checkerboard() = triangle.Checkerboard();
GridBase* grid = diagonal.Grid();
long lsites = grid->lSites();
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(diagonalInv_v, diagonalInv, CpuWrite);
autoView(triangleInv_v, triangleInv, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_inv_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_inv_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
// TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
else
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
}
}
}
}
clover_inv_eigen = clover_eigen.inverse();
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else if(i < j)
triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else
continue;
}
}
}
}
pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
});
}
static void ConvertLayout(const CloverField& full,
CloverDiagonalField& diagonal,
CloverTriangleField& triangle) {
conformable(full, diagonal);
conformable(full, triangle);
diagonal.Checkerboard() = full.Checkerboard();
triangle.Checkerboard() = full.Checkerboard();
autoView(full_v, full, AcceleratorRead);
autoView(diagonal_v, diagonal, AcceleratorWrite);
autoView(triangle_v, triangle, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else if(i < j)
triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else
continue;
}
}
}
}
});
}
static void ConvertLayout(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverField& full) {
conformable(full, diagonal);
conformable(full, triangle);
full.Checkerboard() = diagonal.Checkerboard();
full = Zero();
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(full_v, full, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
else
full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
}
}
}
}
});
}
static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
// Checks/grid
double t0 = usecond();
conformable(diagonal, triangle);
GridBase* grid = diagonal.Grid();
// Determine the boundary coordinates/sites
double t1 = usecond();
int t_dir = Nd - 1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
// Set off-diagonal parts at boundary to zero -- OK
double t2 = usecond();
CloverTriangleField zeroTriangle(grid);
zeroTriangle.Checkerboard() = triangle.Checkerboard();
zeroTriangle = Zero();
triangle = where(t_coor == 0, zeroTriangle, triangle);
triangle = where(t_coor == T-1, zeroTriangle, triangle);
// Set diagonal to unity (scaled correctly) -- OK
double t3 = usecond();
CloverDiagonalField tmp(grid);
tmp.Checkerboard() = diagonal.Checkerboard();
tmp = -1.0 * csw_t + diag_mass;
diagonal = where(t_coor == 0, tmp, diagonal);
diagonal = where(t_coor == T-1, tmp, diagonal);
// Correct values next to boundary
double t4 = usecond();
if(cF != 1.0) {
tmp = cF - 1.0;
tmp += diagonal;
diagonal = where(t_coor == 1, tmp, diagonal);
diagonal = where(t_coor == T-2, tmp, diagonal);
}
// Report timings
double t5 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
<< " checks = " << (t1 - t0) / 1e6
<< ", coordinate = " << (t2 - t1) / 1e6
<< ", off-diag zero = " << (t3 - t2) / 1e6
<< ", diagonal unity = " << (t4 - t3) / 1e6
<< ", near-boundary = " << (t5 - t4) / 1e6
<< ", total = " << (t5 - t0) / 1e6
<< std::endl;
#endif
}
template<class Field, class Mask>
static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
conformable(f, m);
auto grid = f.Grid();
const uint32_t Nsite = grid->oSites();
const uint32_t Nsimd = grid->Nsimd();
autoView(f_v, f, AcceleratorWrite);
autoView(m_v, m, AcceleratorRead);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, Nsite, Nsimd, {
coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
});
}
template<class MaskField>
static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd);
assert(!full.Grid()->_isCheckerBoarded);
GridBase* grid = full.Grid();
int t_dir = Nd-1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
MaskField zeroMask(grid); zeroMask = Zero();
full = 1.0;
full = where(t_coor == 0, zeroMask, full);
full = where(t_coor == T-1, zeroMask, full);
pickCheckerboard(Even, even, full);
pickCheckerboard(Odd, odd, full);
}
};
NAMESPACE_END(Grid);

View File

@ -1,90 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Impl>
class WilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteClover;
typedef Lattice<SiteClover> CloverField;
};
template<class Impl>
class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6
static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
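// (equivalently Nred*(Nred-1)/2: both count the 15 independent
// off-diagonal entries of a hermitian 6x6 block, since Nred = 2*Nc)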
template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
typedef iSinglet<Simd> SiteMask;
typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
typedef Lattice<SiteCloverTriangle> CloverTriangleField;
typedef Lattice<SiteMask> MaskField;
};
#define INHERIT_CLOVER_TYPES(Impl) \
typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal SiteCloverDiagonal; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle SiteCloverTriangle; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteMask SiteMask; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
typedef typename CompactWilsonCloverTypes<Impl>::MaskField MaskField; \
/* ugly duplication but needed inside functionality classes */ \
template<typename vtype> using iImplCloverDiagonal = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
template<typename vtype> using iImplCloverTriangle = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \
static constexpr int Nred = CompactWilsonCloverTypes<Impl>::Nred; \
static constexpr int Nblock = CompactWilsonCloverTypes<Impl>::Nblock; \
static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
NAMESPACE_END(Grid);

View File

@ -75,10 +75,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
int Dirichlet;
Coordinate Block;
/********** Deprecate timers **********/
void Report(void);
void ZeroCounters(void);
double DhopCalls;
@ -177,18 +173,7 @@ public:
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(Coordinate & block)
{
assert(block.size()==Nd+1);
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
Dirichlet = 1;
Block = block;
Stencil.DirichletBlock(block);
StencilEven.DirichletBlock(block);
StencilOdd.DirichletBlock(block);
}
}
// Constructors
/*
WilsonFermion5D(int simd,

View File

@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass_plus(_mass), mass_minus(_mass)
mass(_mass)
{
}
@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// FIXME Redunant with the above routine; check this and eliminate
@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &
// Assemble the 5d matrix
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass_minus*cee[Ls-1];
lower[s] = mass*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass_plus*cee[0];
upper[s] = mass*cee[0];
lower[s] = -cee[s-1];
} else {
upper[s]=-cee[s+1];
@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
upper[s] = cs[s+1];
lower[s] =-mass_minus*cs[Ls-1];
lower[s] =-mass*cs[Ls-1];
} else if ( s==(Ls-1) ) {
upper[s] =-mass_plus*cs[0];
upper[s] =-mass*cs[0];
lower[s] = cs[s-1];
} else {
upper[s] = cs[s+1];
@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass_minus*cee[Ls-1]/bee[0];
leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass_plus;
ueem[i]=mass;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
}
{
Coeff_t delta_d=mass_minus*cee[Ls-1];
Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
@ -642,10 +642,6 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
@ -781,8 +777,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
assert(mu>=0);
assert(mu<Nd);
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;

View File

@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
, DiagonalInv(&Fgrid), TriangleInv(&Fgrid)
, DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
, DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid)
, Tmp(&Fgrid)
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
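/*
   For reference: the nested loops below run mu,nu = 0..3 with mu==nu skipped and
   increment count once per (mu,nu) pair, so sigma[count] is visited as
     count 0..2  -> (mu,nu) = (x,y),(x,z),(x,t)
     count 3..5  -> (mu,nu) = (y,x),(y,z),(y,t)
     count 6..8  -> (mu,nu) = (z,x),(z,y),(z,t)
     count 9..11 -> (mu,nu) = (t,x),(t,y),(t,z)
   i.e. the table above read row by row.
*/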
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
conformable(in, diagonal);
conformable(in, triangle);
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in the Wilson constructor -> performed twice, but can't be avoided
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
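// (For the standard compact clover, MassTerm should simply add diag_mass to the
//  diagonal of TmpOriginal; the exponential variant instead folds the mass into
//  Exponentiate_Clover below -- see the CloverHelpers policy classes.)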
// Convert the data layout of the clover term
double t4 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
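// (Per site, the two hermitian 6x6 spin-colour blocks of the clover matrix are
//  assumed to be repacked into a real diagonal part and a complex triangle part,
//  which is what the Diagonal/Triangle fields hold.)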
// Exponentiate the clover (nothing happens in case of the standard clover)
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possibly modify the boundary values
double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);

View File

@ -2,13 +2,12 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Copyright (C) 2017 - 2022
Copyright (C) 2017
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -34,48 +33,9 @@
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, CloverTerm(&Fgrid)
, CloverTermInv(&Fgrid)
, CloverTermEven(&Hgrid)
, CloverTermOdd(&Hgrid)
, CloverTermInvEven(&Hgrid)
, CloverTermInvOdd(&Hgrid)
, CloverTermDagEven(&Hgrid)
, CloverTermDagOdd(&Hgrid)
, CloverTermInvDagEven(&Hgrid)
, CloverTermInvDagOdd(&Hgrid) {
assert(Nd == 4); // require 4 dimensions
if(clover_anisotropy.isAnisotropic) {
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else {
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if(csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if(csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
// *NOT* EO
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -89,8 +49,8 @@ void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, Fermion
out += temp;
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -104,16 +64,13 @@ void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, Ferm
out += temp;
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
double t1 = usecond();
GridBase *grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
double t2 = usecond();
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@ -122,20 +79,52 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
double t3 = usecond();
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r;
CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
double t4 = usecond();
CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
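// Flatten the site-local clover tensor into a dense (Ns*DimRep) x (Ns*DimRep)
// Eigen matrix (12x12 in the fundamental representation) so that it can be inverted.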
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
double t5 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -148,47 +137,37 @@ void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Um
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t6 = usecond();
std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiation = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t6 - t0) / 1e6 << std::endl;
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverField *Clover;
CloverFieldType *Clover;
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
if (dag)
@ -203,12 +182,12 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
Helpers::multCloverField(out, *Clover, in);
out = *Clover * in;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
out = adj(*Clover) * in;
}
}
else
@ -226,109 +205,29 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
Helpers::multCloverField(out, *Clover, in);
out = *Clover * in;
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
Helpers::multCloverField(out, *Clover, in);
out = *Clover * in;
}
}
} // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Derivative parts
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}

View File

@ -60,8 +60,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid),
Dirichlet(0)
_tmp(&FiveDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
@ -219,14 +218,6 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5);
if ( Dirichlet ) {
std::cout << GridLogMessage << " Dirichlet BCs 5d " <<Block<<std::endl;
Coordinate GaugeBlock(Nd);
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
std::cout << GridLogMessage << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
DirichletFilter<GaugeField> Filter(GaugeBlock);
Filter.applyFilter(HUmu);
}
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);

View File

@ -4,13 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2022
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -600,47 +599,11 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
auto UGrid= this->GaugeGrid();
PropagatorField tmp_shifted(UGrid);
PropagatorField g5Lg5(UGrid);
PropagatorField R(UGrid);
PropagatorField gmuR(UGrid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
g5Lg5=g5*q_in_1*g5;
tmp_shifted=Cshift(q_in_2,mu,1);
Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
gmuR=gmu*R;
q_out=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
tmp_shifted=Cshift(q_in_1,mu,1);
Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
g5Lg5=g5*g5Lg5*g5;
R=q_in_2;
gmuR=gmu*R;
q_out-=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
assert(0);
}
@ -654,51 +617,9 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
auto UGrid= this->GaugeGrid();
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
PropagatorField L(UGrid);
PropagatorField zz (UGrid);
zz=Zero();
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
tmp = Cshift(q_in,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
tmp = q_in *lattice_cmplx;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -( Utmp + gmu*Utmp );
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -1,44 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -8,8 +8,7 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@ -32,12 +31,10 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>;
template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>;
template class WilsonCloverFermion<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../CompactWilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../CompactWilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1,51 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -18,10 +18,6 @@ WILSON_IMPL_LIST=" \
GparityWilsonImplF \
GparityWilsonImplD "
COMPACT_WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD "
DWF_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
@ -50,17 +46,7 @@ for impl in $WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
@ -77,14 +63,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
# overwrite the .cc file in Gparity directories
for impl in $GDWF_IMPL_LIST
do
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
done
@ -98,7 +84,7 @@ for impl in $STAG_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done

View File

@ -1,102 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// DDHMC filter with sub-block size B[mu]
////////////////////////////////////////////////////
template<typename GaugeField>
struct DDHMCFilter: public MomentumFilterBase<GaugeField>
{
Coordinate Block;
int Width;
DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; }
void applyFilter(GaugeField &U) const override
{
GridBase *grid = U.Grid();
Coordinate Global=grid->GlobalDimensions();
GaugeField zzz(grid); zzz = Zero();
LatticeInteger coor(grid);
auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
////////////////////////////////////////////////////
// Zero BDY layers
////////////////////////////////////////////////////
std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
LatticeCoordinate(coor,mu);
////////////////////////////////
// OmegaBar - zero all links contained in slice B-1,0 and
// mu links connecting to Omega
////////////////////////////////
if ( Width==1) {
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
if ( Width==2) {
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
if ( Width==3) {
U = where(mod(coor,B1)==Integer(B1-3),zzz,U);
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
U = where(mod(coor,B1)==Integer(2) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
}
}
};
NAMESPACE_END(Grid);

View File

@ -1,71 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
#pragma once
NAMESPACE_BEGIN(Grid);
template<typename MomentaField>
struct DirichletFilter: public MomentumFilterBase<MomentaField>
{
typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
typedef iScalar<iScalar<iScalar<vector_type> > > ScalarType; //complex phase for each site
Coordinate Block;
DirichletFilter(const Coordinate &_Block): Block(_Block){}
void applyFilter(MomentaField &P) const override
{
GridBase *grid = P.Grid();
typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LatCM;
////////////////////////////////////////////////////
// Zero only the links that cross between domains
////////////////////////////////////////////////////
LatticeInteger coor(grid);
LatCM zz(grid); zz = Zero();
for(int mu=0;mu<Nd;mu++) {
if ( (Block[mu]) && (Block[mu] < grid->GlobalDimensions()[mu] ) ) {
// If costly could provide Grid earlier and precompute masks
std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
LatticeCoordinate(coor,mu);
auto P_mu = PeekIndex<LorentzIndex>(P, mu);
P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
PokeIndex<LorentzIndex>(P, P_mu, mu);
}
}
}
};
NAMESPACE_END(Grid);

View File

@ -129,10 +129,18 @@ public:
Runner(S);
}
//Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U.
//This is called automatically by Run but may be useful elsewhere, e.g. for integrator tuning experiments
void initializeGaugeFieldAndRNGs(Field &U){
if(!Resources.haveRNGs()) Resources.AddRNGs();
//////////////////////////////////////////////////////////////////
private:
template <class SmearingPolicy>
void Runner(SmearingPolicy &Smearing) {
auto UGrid = Resources.GetCartesian();
Resources.AddRNGs();
Field U(UGrid);
// Can move this outside?
typedef IntegratorType<SmearingPolicy> TheIntegrator;
TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
if (Parameters.StartingType == "HotStart") {
// Hot start
@ -159,25 +167,6 @@ public:
<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
exit(1);
}
}
//////////////////////////////////////////////////////////////////
private:
template <class SmearingPolicy>
void Runner(SmearingPolicy &Smearing) {
auto UGrid = Resources.GetCartesian();
Field U(UGrid);
initializeGaugeFieldAndRNGs(U);
typedef IntegratorType<SmearingPolicy> TheIntegrator;
TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
// Sets the momentum filter
MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter()));
Smearing.set_Field(U);

View File

@ -34,7 +34,6 @@ directory
* @brief Classes for Hybrid Monte Carlo update
*
* @author Guido Cossu
* @author Peter Boyle
*/
//--------------------------------------------------------------------
#pragma once
@ -116,17 +115,22 @@ private:
random(sRNG, rn_test);
std::cout << GridLogHMC << "--------------------------------------------------\n";
std::cout << GridLogHMC << "exp(-dH) = " << prob << " Random = " << rn_test << "\n";
std::cout << GridLogHMC << "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
std::cout << GridLogMessage
<< "--------------------------------------------------\n";
std::cout << GridLogMessage << "exp(-dH) = " << prob
<< " Random = " << rn_test << "\n";
std::cout << GridLogMessage
<< "Acc. Probability = " << ((prob < 1.0) ? prob : 1.0) << "\n";
if ((prob > 1.0) || (rn_test <= prob)) { // accepted
std::cout << GridLogHMC << "Metropolis_test -- ACCEPTED\n";
std::cout << GridLogHMC << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Metropolis_test -- ACCEPTED\n";
std::cout << GridLogMessage
<< "--------------------------------------------------\n";
return true;
} else { // rejected
std::cout << GridLogHMC << "Metropolis_test -- REJECTED\n";
std::cout << GridLogHMC << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Metropolis_test -- REJECTED\n";
std::cout << GridLogMessage
<< "--------------------------------------------------\n";
return false;
}
}
@ -135,68 +139,19 @@ private:
// Evolution
/////////////////////////////////////////////////////////
RealD evolve_hmc_step(Field &U) {
TheIntegrator.refresh(U, sRNG, pRNG); // set U and initialize P and phi's
GridBase *Grid = U.Grid();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Mainly for DDHMC: perform a random translation of U, modulo the lattice volume
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Random shifting gauge field by [";
for(int d=0;d<Grid->Nd();d++) {
int L = Grid->GlobalDimensions()[d];
RealD rn_uniform; random(sRNG, rn_uniform);
int shift = (int) (rn_uniform*L);
std::cout << shift;
if(d<Grid->Nd()-1) std::cout <<",";
else std::cout <<"]\n";
U = Cshift(U,d,shift);
}
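// (The action is translation invariant, so this shift leaves the physics unchanged;
//  for DDHMC it moves the frozen domain boundaries to different physical sites on
//  each trajectory.)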
std::cout << GridLogMessage << "--------------------------------------------------\n";
TheIntegrator.reset_timer();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// set U and initialize P and phi's
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Refresh momenta and pseudofermions";
TheIntegrator.refresh(U, sRNG, pRNG);
std::cout << GridLogMessage << "--------------------------------------------------\n";
//////////////////////////////////////////////////////////////////////////////////////////////////////
// initial state action
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Compute initial action";
RealD H0 = TheIntegrator.S(U);
std::cout << GridLogMessage << "--------------------------------------------------\n";
RealD H0 = TheIntegrator.S(U); // initial state action
std::streamsize current_precision = std::cout.precision();
std::cout.precision(15);
std::cout << GridLogHMC << "Total H before trajectory = " << H0 << "\n";
std::cout << GridLogMessage << "Total H before trajectory = " << H0 << "\n";
std::cout.precision(current_precision);
std::cout << GridLogMessage << "--------------------------------------------------\n";
std::cout << GridLogMessage << " Molecular Dynamics evolution ";
TheIntegrator.integrate(U);
std::cout << GridLogMessage << "--------------------------------------------------\n";
//////////////////////////////////////////////////////////////////////////////////////////////////////
// updated state action
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Compute final action";
RealD H1 = TheIntegrator.S(U);
std::cout << GridLogMessage << "--------------------------------------------------\n";
RealD H1 = TheIntegrator.S(U); // updated state action
///////////////////////////////////////////////////////////
if(0){
std::cout << "------------------------- Reversibility test" << std::endl;
@ -208,16 +163,17 @@ private:
}
///////////////////////////////////////////////////////////
std::cout.precision(15);
std::cout << GridLogHMC << "--------------------------------------------------\n";
std::cout << GridLogHMC << "Total H after trajectory = " << H1 << " dH = " << H1 - H0 << "\n";
std::cout << GridLogHMC << "--------------------------------------------------\n";
std::cout << GridLogMessage << "Total H after trajectory = " << H1
<< " dH = " << H1 - H0 << "\n";
std::cout.precision(current_precision);
return (H1 - H0);
}
public:
/////////////////////////////////////////
@ -239,13 +195,10 @@ public:
// Actual updates (evolve a copy Ucopy then copy back eventually)
unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory;
for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) {
std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n";
std::cout << GridLogMessage << "-- # Trajectory = " << traj << "\n";
if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) {
std::cout << GridLogHMC << "-- Thermalization" << std::endl;
std::cout << GridLogMessage << "-- Thermalization" << std::endl;
}
double t0=usecond();
@ -254,19 +207,20 @@ public:
DeltaH = evolve_hmc_step(Ucopy);
// Metropolis-Hastings test
bool accept = true;
if (Params.MetropolisTest && traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
if (traj >= Params.StartTrajectory + Params.NoMetropolisUntil) {
accept = metropolis_test(DeltaH);
} else {
std::cout << GridLogHMC << "Skipping Metropolis test" << std::endl;
std::cout << GridLogMessage << "Skipping Metropolis test" << std::endl;
}
if (accept)
Ucur = Ucopy;
double t1=usecond();
std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
std::cout << GridLogMessage << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
TheIntegrator.print_timer();
for (int obs = 0; obs < Observables.size(); obs++) {
std::cout << GridLogDebug << "Observables # " << obs << std::endl;
@ -274,7 +228,7 @@ public:
std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
}
std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
}
}
View File
@ -72,8 +72,6 @@ class HMCResourceManager {
typedef HMCModuleBase< BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
typedef HMCModuleBase< HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
typedef ActionModuleBase< Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
typedef typename ImplementationPolicy::Field MomentaField;
typedef typename ImplementationPolicy::Field Field;
// Named storage for grid pairs (std + red-black)
std::unordered_map<std::string, GridModule> Grids;
@ -82,9 +80,6 @@ class HMCResourceManager {
// SmearingModule<ImplementationPolicy> Smearing;
std::unique_ptr<CheckpointerBaseModule> CP;
// Momentum filter
std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> > Filter;
// A vector of HmcObservable modules
std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
@ -95,7 +90,6 @@ class HMCResourceManager {
bool have_RNG;
bool have_CheckPointer;
bool have_Filter;
// NOTE: operator << is not overloaded for std::vector<string>
// so this function is necessary
@ -107,7 +101,7 @@ class HMCResourceManager {
public:
HMCResourceManager() : have_RNG(false), have_CheckPointer(false), have_Filter(false) {}
HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
template <class ReaderClass, class vector_type = vComplex >
void initialize(ReaderClass &Read){
@ -135,7 +129,6 @@ public:
RNGModuleParameters RNGpar(Read);
SetRNGSeeds(RNGpar);
// Observables
auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance();
Read.push(observable_string);// here must check if existing...
@ -215,16 +208,6 @@ public:
AddGrid(s, Mod);
}
void SetMomentumFilter( MomentumFilterBase<typename ImplementationPolicy::Field> * MomFilter) {
assert(have_Filter==false);
Filter = std::unique_ptr<MomentumFilterBase<typename ImplementationPolicy::Field> >(MomFilter);
have_Filter = true;
}
MomentumFilterBase<typename ImplementationPolicy::Field> *GetMomentumFilter(void) {
if ( !have_Filter)
SetMomentumFilter(new MomentumFilterNone<typename ImplementationPolicy::Field>());
return Filter.get();
}
GridCartesian* GetCartesian(std::string s = "") {
if (s.empty()) s = Grids.begin()->first;
@ -243,9 +226,6 @@ public:
//////////////////////////////////////////////////////
// Random number generators
//////////////////////////////////////////////////////
//Return true if the RNG objects have been instantiated
bool haveRNGs() const{ return have_RNG; }
void AddRNGs(std::string s = "") {
// Couple the RNGs to the GridModule tagged by s
View File
@ -33,6 +33,7 @@ directory
#define INTEGRATOR_INCLUDED
#include <memory>
#include "MomentumFilter.h"
NAMESPACE_BEGIN(Grid);
@ -66,7 +67,6 @@ public:
template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
class Integrator {
protected:
typedef typename FieldImplementation::Field MomentaField; //for readability
typedef typename FieldImplementation::Field Field;
@ -119,58 +119,36 @@ protected:
}
} update_P_hireps{};
void update_P(MomentaField& Mom, Field& U, int level, double ep) {
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
Field force(U.Grid());
conformable(U.Grid(), Mom.Grid());
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
double start_force = usecond();
as[level].actions.at(a)->deriv_timer_start();
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv_timer_stop();
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
auto name = as[level].actions.at(a)->action_name();
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
double end_force = usecond();
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
DumpSliceNorm("force ",force,Nd-1);
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
Real force_max = std::sqrt(maxLocalNorm2(force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
as[level].actions.at(a)->deriv_log(force_abs,force_max);
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max : " << force_max <<" "<<name<<std::endl;
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average : " << impulse_abs <<" "<<name<<std::endl;
std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max : " << impulse_max <<" "<<name<<std::endl;
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;;
double end_full = usecond();
double time_full = (end_full - start_full) / 1e3;
double time_force = (end_force - start_force) / 1e3;
std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)" << std::endl;
}
// Force from the other representations
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
MomFilter->applyFilter(Mom);
}
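The MomFilter->applyFilter calls above are the hook through which DDHMC freezes forces and momenta on domain boundaries. A minimal filter satisfying that interface might look like the following sketch (illustrative only, not Grid's DDHMCFilter; it assumes MomentumFilterBase exposes just the single virtual applyFilter used here):
// Freeze a chosen set of links by multiplying the momentum/force by a
// precomputed field that is one on active links and zero on frozen ones.
template <class Field>
class MaskMomentumFilter : public MomentumFilterBase<Field> {
public:
  Field mask;                                  // 1 on active links, 0 on frozen links
  MaskMomentumFilter(const Field &m) : mask(m) {}
  void applyFilter(Field &P) const override {
    P = P * mask;                              // frozen links receive no update
  }
};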
void update_U(Field& U, double ep)
@ -184,12 +162,8 @@ protected:
void update_U(MomentaField& Mom, Field& U, double ep)
{
MomentaField MomFiltered(Mom.Grid());
MomFiltered = Mom;
MomFilter->applyFilter(MomFiltered);
// exponential of Mom*U in the gauge fields case
FieldImplementation::update_field(MomFiltered, U, ep);
FieldImplementation::update_field(Mom, U, ep);
// Update the smeared fields, can be implemented as observer
Smearer.set_Field(U);
@ -232,66 +206,6 @@ public:
const MomentaField & getMomentum() const{ return P; }
void reset_timer(void)
{
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
as[level].actions.at(actionID)->reset_timer();
}
}
}
void print_timer(void)
{
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::" << std::endl;
std::cout << GridLogMessage << " Refresh cumulative timings "<<std::endl;
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
std::cout << GridLogMessage
<< as[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] "
<< as[level].actions.at(actionID)->refresh_us*1.0e-6<<" s"<< std::endl;
}
}
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Action cumulative timings "<<std::endl;
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
std::cout << GridLogMessage
<< as[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] "
<< as[level].actions.at(actionID)->S_us*1.0e-6<<" s"<< std::endl;
}
}
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Force cumulative timings "<<std::endl;
std::cout << GridLogMessage << "------------------------- "<<std::endl;
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
std::cout << GridLogMessage
<< as[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] "
<< as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
}
}
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Force average size "<<std::endl;
std::cout << GridLogMessage << "------------------------- "<<std::endl;
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
std::cout << GridLogMessage
<< as[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] : "
<<" force max " << as[level].actions.at(actionID)->deriv_max_average()
<<" norm " << as[level].actions.at(actionID)->deriv_norm_average()
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
void print_parameters()
{
std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@ -310,6 +224,7 @@ public:
}
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
void reverse_momenta()
@ -352,19 +267,15 @@ public:
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
auto name = as[level].actions.at(actionID)->action_name();
std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
as[level].actions.at(actionID)->refresh_timer_start();
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
as[level].actions.at(actionID)->refresh_timer_stop();
}
// Refresh the higher representation actions
as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
}
MomFilter->applyFilter(P);
}
// to be used by the actionlevel class to iterate
@ -399,9 +310,7 @@ public:
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
Hterm = as[level].actions.at(actionID)->S(Us);
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
}
View File
@ -55,12 +55,12 @@ public:
}
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
GridBase *grid = Umu.Grid();
GaugeMat xform(grid);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
GridBase *grid = Umu.Grid();
@ -122,8 +122,6 @@ public:
}
}
std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
if (err_on_no_converge) assert(0);
};
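The new err_on_no_converge flag makes the abort on a stalled fix optional. A hypothetical call site (the enclosing class name GaugeFixer is a placeholder, since it is not visible in this hunk, and the tolerances are illustrative values only):
GaugeLorentz U(grid);
GaugeMat     xform(grid);
Real         alpha = 0.1;
GaugeFixer::SteepestDescentGaugeFix(U, xform, alpha,
                                    100000,    // maxiter
                                    1.0e-12,   // Omega_tol
                                    1.0e-12,   // Phi_tol
                                    true,      // Fourier acceleration
                                    -1,        // orthog: no direction excluded
                                    false);    // err_on_no_converge: warn, don't assert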
static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
GridBase *grid = U[0].Grid();
View File
@ -125,6 +125,7 @@ public:
return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over all x,y,z of the temporal loop
//////////////////////////////////////////////////
@ -164,7 +165,7 @@ public:
double vol = Umu.Grid()->gSites();
return p.real() / vol / (4.0 * Nc ) ;
return p.real() / vol / 4.0 / 3.0;
};
//////////////////////////////////////////////////
View File
@ -52,11 +52,6 @@ public:
return arg;
}
};
class SimpleStencilParams{
public:
Coordinate dirichlet;
SimpleStencilParams() {};
};
NAMESPACE_END(Grid);
View File
@ -133,8 +133,6 @@ class CartesianStencilAccelerator {
int _osites;
StencilVector _directions;
StencilVector _distances;
StencilVector _comms_send;
StencilVector _comms_recv;
StencilVector _comm_buf_size;
StencilVector _permute_type;
StencilVector same_node;
@ -228,8 +226,6 @@ public:
void * recv_buf;
Integer to_rank;
Integer from_rank;
Integer do_send;
Integer do_recv;
Integer bytes;
};
struct Merge {
@ -244,20 +240,7 @@ public:
cobj * mpi_p;
Integer buffer_size;
};
struct CopyReceiveBuffer {
void * from_p;
void * to_p;
Integer bytes;
};
struct CachedTransfer {
Integer direction;
Integer OrthogPlane;
Integer DestProc;
Integer bytes;
Integer lane;
Integer cb;
void *recv_buf;
};
protected:
GridBase * _grid;
@ -288,8 +271,7 @@ public:
std::vector<Merge> MergersSHM;
std::vector<Decompress> Decompressions;
std::vector<Decompress> DecompressionsSHM;
std::vector<CopyReceiveBuffer> CopyReceiveBuffers ;
std::vector<CachedTransfer> CachedTransfers;
///////////////////////////////////////////////////////////
// Unified Comms buffers for all directions
///////////////////////////////////////////////////////////
@ -302,6 +284,29 @@ public:
int u_comm_offset;
int _unified_buffer_size;
/////////////////////////////////////////
// Timing info; ugly; possibly temporary
/////////////////////////////////////////
double commtime;
double mpi3synctime;
double mpi3synctime_g;
double shmmergetime;
double gathertime;
double gathermtime;
double halogtime;
double mergetime;
double decompresstime;
double comms_bytes;
double shm_bytes;
double splicetime;
double nosplicetime;
double calls;
std::vector<double> comm_bytes_thr;
std::vector<double> shm_bytes_thr;
std::vector<double> comm_time_thr;
std::vector<double> comm_enter_thr;
std::vector<double> comm_leave_thr;
////////////////////////////////////////
// Stencil query
////////////////////////////////////////
@ -328,12 +333,11 @@ public:
//////////////////////////////////////////
// Comms packet queue for asynch thread
// Use OpenMP Tasks for cleaner ???
// must be called *inside* parallel region
//////////////////////////////////////////
/*
void CommunicateThreaded()
{
#ifdef GRID_OMP
// must be called in parallel region
int mythread = omp_get_thread_num();
int nthreads = CartesianCommunicator::nCommThreads;
#else
@ -342,29 +346,65 @@ public:
#endif
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
comm_enter_thr[mythread] = usecond();
for (int i = mythread; i < Packets.size(); i += nthreads) {
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_bytes_thr[mythread] += bytes;
shm_bytes_thr[mythread] += 2*Packets[i].bytes-bytes; // Send + Recv.
}
comm_leave_thr[mythread]= usecond();
comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
}
}
*/
void CollateThreads(void)
{
int nthreads = CartesianCommunicator::nCommThreads;
double first=0.0;
double last =0.0;
for(int t=0;t<nthreads;t++) {
double t0 = comm_enter_thr[t];
double t1 = comm_leave_thr[t];
comms_bytes+=comm_bytes_thr[t];
shm_bytes +=shm_bytes_thr[t];
comm_enter_thr[t] = 0.0;
comm_leave_thr[t] = 0.0;
comm_time_thr[t] = 0.0;
comm_bytes_thr[t]=0;
shm_bytes_thr[t]=0;
if ( first == 0.0 ) first = t0; // first is t0
if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
if ( t1 > last ) last = t1; // max time seen
}
commtime+= last-first;
}
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
reqs.resize(Packets.size());
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
uint64_t bytes=_grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes;
}
}
@ -373,6 +413,7 @@ public:
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
commtime+=usecond();
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@ -380,27 +421,28 @@ public:
void Communicate(void)
{
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
/////////////////////////////////////////////////////////
// several way threaded on different communicators.
// Cannot combine with Dirichlet operators
// This scheme is needed on Intel Omnipath for best performance
// Deprecate once there are very few omnipath clusters
/////////////////////////////////////////////////////////
int nthreads = CartesianCommunicator::nCommThreads;
int old = GridThread::GetThreads();
GridThread::SetThreads(nthreads);
thread_for(i,Packets.size(),{
_grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].bytes,i);
});
GridThread::SetThreads(old);
} else {
/////////////////////////////////////////////////////////
// Concurrent and non-threaded asynch calls to MPI
/////////////////////////////////////////////////////////
thread_region {
// must be called in parallel region
int mythread = thread_num();
int maxthreads= thread_max();
int nthreads = CartesianCommunicator::nCommThreads;
assert(nthreads <= maxthreads);
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_bytes_thr[mythread] += bytes;
shm_bytes_thr[mythread] += Packets[i].bytes - bytes;
comm_time_thr[mythread] += usecond() - start;
}
}
}
} else { // Concurrent and non-threaded asynch calls to MPI
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
@ -442,23 +484,31 @@ public:
sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx,point);
splicetime-=usecond();
auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx);
is_same_node = is_same_node && tmp;
splicetime+=usecond();
} else {
auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx,point);
nosplicetime-=usecond();
auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx);
is_same_node = is_same_node && tmp;
nosplicetime+=usecond();
}
} else {
if(splice_dim){
splicetime-=usecond();
// if checkerboard is unfavourable take two passes
// both with block stride loop iteration
auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx,point);
auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx,point);
auto tmp1 = GatherSimd(source,dimension,shift,0x1,compress,face_idx);
auto tmp2 = GatherSimd(source,dimension,shift,0x2,compress,face_idx);
is_same_node = is_same_node && tmp1 && tmp2;
splicetime+=usecond();
} else {
auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx,point);
auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx,point);
nosplicetime-=usecond();
auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx);
auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx);
is_same_node = is_same_node && tmp1 && tmp2;
nosplicetime+=usecond();
}
}
}
@ -468,10 +518,13 @@ public:
template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
mpi3synctime_g-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime_g+=usecond();
// conformable(source.Grid(),_grid);
assert(source.Grid()==_grid);
halogtime-=usecond();
u_comm_offset=0;
@ -485,6 +538,7 @@ public:
assert(u_comm_offset==_unified_buffer_size);
accelerator_barrier();
halogtime+=usecond();
}
/////////////////////////
@ -497,70 +551,14 @@ public:
Mergers.resize(0);
MergersSHM.resize(0);
Packets.resize(0);
CopyReceiveBuffers.resize(0);
CachedTransfers.resize(0);
calls++;
}
void AddCopy(void *from,void * to, Integer bytes)
{
CopyReceiveBuffer obj;
obj.from_p = from;
obj.to_p = to;
obj.bytes= bytes;
CopyReceiveBuffers.push_back(obj);
}
void CommsCopy()
{
// These are device resident MPI buffers.
for(int i=0;i<CopyReceiveBuffers.size();i++){
cobj *from=(cobj *)CopyReceiveBuffers[i].from_p;
cobj *to =(cobj *)CopyReceiveBuffers[i].to_p;
Integer words = CopyReceiveBuffers[i].bytes/sizeof(cobj);
accelerator_forNB(j, words, cobj::Nsimd(), {
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
}
}
Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb)
{
CachedTransfer obj;
obj.direction = direction;
obj.OrthogPlane = OrthogPlane;
obj.DestProc = DestProc;
obj.recv_buf = recv_buf;
obj.lane = lane;
obj.bytes = bytes;
obj.cb = cb;
for(int i=0;i<CachedTransfers.size();i++){
if ( (CachedTransfers[i].direction ==direction)
&&(CachedTransfers[i].OrthogPlane==OrthogPlane)
&&(CachedTransfers[i].DestProc ==DestProc)
&&(CachedTransfers[i].bytes ==bytes)
&&(CachedTransfers[i].lane ==lane)
&&(CachedTransfers[i].cb ==cb)
){
AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
return 1;
}
}
CachedTransfers.push_back(obj);
return 0;
}
void AddPacket(void *xmit,void * rcv,
Integer to, Integer do_send,
Integer from, Integer do_recv,
Integer bytes){
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
Packet p;
p.send_buf = xmit;
p.recv_buf = rcv;
p.to_rank = to;
p.from_rank= from;
p.do_send = do_send;
p.do_recv = do_recv;
p.bytes = bytes;
Packets.push_back(p);
}
@ -580,17 +578,22 @@ public:
mv.push_back(m);
}
template<class decompressor> void CommsMerge(decompressor decompress) {
CommsCopy();
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond();
shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
shmmergetime+=usecond();
}
template<class decompressor>
void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
{
void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd) {
mergetime-=usecond();
for(int i=0;i<mm.size();i++){
auto mp = &mm[i].mpointer[0];
auto vp0= &mm[i].vpointers[0][0];
@ -600,7 +603,9 @@ public:
decompress.Exchange(mp,vp0,vp1,type,o);
});
}
mergetime+=usecond();
decompresstime-=usecond();
for(int i=0;i<dd.size();i++){
auto kp = dd[i].kernel_p;
auto mp = dd[i].mpi_p;
@ -608,6 +613,7 @@ public:
decompress.Decompress(kp,mp,o);
});
}
decompresstime+=usecond();
}
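Taken together, the gather, packet, merge and decompress members above are driven in a fixed order by the fermion kernels; a sketch of the overlapped sequence (method names as declared in this class, st a CartesianStencil, compress a compressor; the interior/exterior kernel split and error handling are omitted):
std::vector<std::vector<CommsRequest_t> > reqs;
st.HaloGather(src, compress);     // gather faces into send buffers, queue Packets
st.CommunicateBegin(reqs);        // post the non-blocking sends/receives
st.CommsMergeSHM(compress);       // merge faces reachable through shared memory
st.CommunicateComplete(reqs);     // wait for the MPI transfers to finish
st.CommsMerge(compress);          // decompress/merge the received faces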
////////////////////////////////////////
// Set up routines
@ -644,54 +650,18 @@ public:
}
}
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
{
for(int ii=0;ii<this->_npoints;ii++){
int dimension = this->_directions[ii];
int displacement = this->_distances[ii];
int gd = _grid->_gdimensions[dimension];
int fd = _grid->_fdimensions[dimension];
int pd = _grid->_processors [dimension];
int pc = _grid->_processor_coor[dimension];
int ld = fd/pd;
///////////////////////////////////////////
// Figure out dirichlet send and receive
// on this leg of stencil.
///////////////////////////////////////////
int comm_dim = _grid->_processors[dimension] >1 ;
int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
if ( block && comm_dim ) {
assert(abs(displacement) < ld );
if( displacement > 0 ) {
// High side, low side
// | <--B--->|
// | | |
// noR
// noS
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_recv[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_send[ii] = 0;
} else {
// High side, low side
// | <--B--->|
// | | |
// noS
// noR
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
}
}
}
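The modulo tests above drop a leg's send or receive exactly when the corresponding local boundary lands on a block boundary. A small worked check in plain C++ (the values are purely illustrative): with local extent ld = 16 per rank and block = 32, only every second rank boundary is a domain wall, and the flags alternate accordingly.
#include <cstdio>
int main() {
  const int ld = 16, block = 32, pd = 4;   // local extent, block size, ranks in this dim
  for (int pc = 0; pc < pd; pc++) {        // displacement > 0 case of the rule above
    int comms_recv = ((ld * (pc + 1)) % block) ? 1 : 0;  // high-side boundary
    int comms_send = ((ld *  pc     ) % block) ? 1 : 0;  // low-side boundary
    std::printf("rank %d : send %d recv %d\n", pc, comms_send, comms_recv);
  }
  return 0;
}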
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances,
Parameters p)
: shm_bytes_thr(npoints),
comm_bytes_thr(npoints),
comm_enter_thr(npoints),
comm_leave_thr(npoints),
comm_time_thr(npoints)
{
face_table_computed=0;
_grid = grid;
@ -705,12 +675,8 @@ public:
this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
this->_directions = StencilVector(directions);
this->_distances = StencilVector(distances);
this->_comms_send.resize(npoints);
this->_comms_recv.resize(npoints);
this->same_node.resize(npoints);
if ( p.dirichlet.size() ) DirichletBlock(p.dirichlet); // comms send/recv set up
_unified_buffer_size=0;
surface_list.resize(0);
@ -727,16 +693,15 @@ public:
int displacement = distances[i];
int shift = displacement;
int gd = _grid->_gdimensions[dimension];
int fd = _grid->_fdimensions[dimension];
int pd = _grid->_processors [dimension];
int ld = gd/pd;
int rd = _grid->_rdimensions[dimension];
int pc = _grid->_processor_coor[dimension];
this->_permute_type[point]=_grid->PermuteType(dimension);
this->_checkerboard = checkerboard;
//////////////////////////
// the permute type
//////////////////////////
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
@ -745,6 +710,7 @@ public:
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
int sshift[2];
//////////////////////////
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
@ -845,7 +811,6 @@ public:
GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -902,32 +867,25 @@ public:
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
// Wrap locally dirichlet support case OR node local
if ( offnode==0 ) {
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
if ( comms_recv==0 ) {
int permute_slice=1;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
}
}
if ( offnode ) {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// int rank = grid->_processor;
// int recv_from_rank;
// int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
@ -1026,14 +984,11 @@ public:
}
template<class compressor>
int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
{
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
int comms_send = this->_comms_send[point] ;
int comms_recv = this->_comms_recv[point] ;
assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid());
@ -1056,93 +1011,78 @@ public:
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc) {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * compress.CommDatumSize();
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int comm_off = u_comm_offset;
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
// int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
cobj *recv_buf;
cobj *send_buf;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
if( comms_send ) {
if ( !face_table_computed ) {
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
send_buf = this->u_send_buf_p; // Gather locally, must send
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
cobj *recv_buf;
if ( compress.DecompressionStep() ) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
}
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
if ( (!duplicate) ) { // Force comms for now
cobj *send_buf;
send_buf = this->u_send_buf_p; // Gather locally, must send
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, comms_send,
recv_from_rank, comms_recv,
bytes);
}
if ( compress.DecompressionStep() && comms_recv ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
////////////////////////////////////////////////////////
// Gather locally
////////////////////////////////////////////////////////
gathertime-=usecond();
assert(send_buf!=NULL);
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
gathertime+=usecond();
///////////////////////////////////////////////////////////
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&recv_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
if ( compress.DecompressionStep() ) {
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
&recv_buf[u_comm_offset],
words,Decompressions);
}
u_comm_offset+=words;
face_idx++;
}
}
return 0;
}
template<class compressor>
int GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx,int point)
int GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
const int Nsimd = _grid->Nsimd();
const int maxl =2;// max layout in a direction
int comms_send = this->_comms_send[point] ;
int comms_recv = this->_comms_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
@ -1157,6 +1097,7 @@ public:
int permute_type=_grid->PermuteType(dimension);
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
@ -1190,9 +1131,8 @@ public:
if ( any_offnode ) {
int comm_off = u_comm_offset;
for(int i=0;i<maxl;i++){
spointers[i] = (cobj *) &u_simd_send_buf[i][comm_off];
spointers[i] = (cobj *) &u_simd_send_buf[i][u_comm_offset];
}
int sx = (x+sshift)%rd;
@ -1201,17 +1141,18 @@ public:
face_table.resize(face_idx+1);
std::vector<std::pair<int,int> > face_table_host ;
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host);
face_table[face_idx].resize(face_table_host.size());
acceleratorCopyToDevice(&face_table_host[0],
&face_table[face_idx][0],
face_table[face_idx].size()*sizeof(face_table_host[0]));
}
gathermtime-=usecond();
if ( comms_send )
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
face_idx++;
gathermtime+=usecond();
//spointers[0] -- low
//spointers[1] -- high
@ -1228,8 +1169,8 @@ public:
int nbr_plane = nbr_ic;
assert (sx == nbr_ox);
auto rp = &u_simd_recv_buf[i ][comm_off];
auto sp = &u_simd_send_buf[nbr_plane][comm_off];
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset];
if(nbr_proc){
@ -1240,13 +1181,8 @@ public:
rpointers[i] = rp;
int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask);
if ( !duplicate ) {
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,comms_send,
recv_from_rank,comms_recv,
bytes);
}
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
} else {
@ -1255,12 +1191,9 @@ public:
}
}
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
u_comm_offset +=buffer_size;
}
}
return 0;
View File
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
// Specialisation: Cayley-Hamilton exponential for SU(3)
#ifndef GRID_ACCELERATED
#ifndef GRID_CUDA
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{
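For background, the Cayley-Hamilton specialisation named in the comment rests on a standard identity rather than anything specific to this hunk: a traceless 3x3 matrix Q satisfies Q^3 = c1 Q + c0 I with c1 = (1/2) tr(Q^2) and c0 = det(Q), so every analytic function of Q collapses to a quadratic polynomial in Q, and the exponential can be written exp(alpha Q) = f0(alpha) I + f1(alpha) Q + f2(alpha) Q^2, with the coefficients f_i fixed by matching the scalar exponential on the eigenvalues of Q.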
View File
@ -342,7 +342,7 @@ extern hipStream_t copyStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return hipThreadIdx_x;
return hipThreadIdx_z;
#else
return 0;
#endif
@ -356,41 +356,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
{ __VA_ARGS__;} \
}; \
int nt=acceleratorThreads(); \
dim3 hip_threads(nsimd, nt, 1); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
} \
dim3 hip_threads(nt,1,nsimd); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
// Following the same scheme as CUDA for now
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
}
template<typename lambda> __global__
__launch_bounds__(1024,1)
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
// Following the same scheme as CUDA for now
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
@ -441,7 +419,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
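A usage note on the change just above: the device-to-device copy is now enqueued on copyStream instead of blocking, so a caller must pair it with acceleratorCopySynchronise() before touching the destination (src_d, dst_d and bytes below are placeholders):
acceleratorCopyDeviceToDeviceAsynch(src_d, dst_d, bytes);  // enqueued on copyStream
// ... queue independent kernels that do not read dst_d ...
acceleratorCopySynchronise();                              // hipStreamSynchronize(copyStream)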
@ -461,8 +439,6 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#define GRID_ACCELERATED
#endif
//////////////////////////////////////////////
@ -483,10 +459,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
View File
@ -72,20 +72,3 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_region DO_PRAGMA(omp parallel)
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
thread_for(w,words,{
uto[w] = ufrom[w];
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}
#endif
View File
@ -167,13 +167,6 @@ void GridCmdOptionInt(std::string &str,int & val)
return;
}
void GridCmdOptionFloat(std::string &str,float & val)
{
std::stringstream ss(str);
ss>>val;
return;
}
void GridParseLayout(char **argv,int argc,
Coordinate &latt_c,
@ -534,7 +527,6 @@ void Grid_init(int *argc,char ***argv)
void Grid_finalize(void)
{
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
Grid_unquiesce_nodes();
#endif
View File
@ -57,7 +57,6 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
template<class VectorInt>
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
void GridCmdOptionInt(std::string &str,int & val);
void GridCmdOptionFloat(std::string &str,float & val);
void GridParseLayout(char **argv,int argc,
View File
@ -1,232 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionR FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 4;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 8;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 16;
Real beta = 2.13;
Real light_mass = 0.01;
Real strange_mass = 0.04;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD b = 1.0;
RealD c = 0.0;
// FIXME:
// Same in MC and MD
// Need to mix precision too
OneFlavourRationalParams OFRp;
OFRp.lo = 4.0e-3;
OFRp.hi = 30.0;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-10;
OFRp.degree = 16;
OFRp.precision= 50;
std::vector<Real> hasenbusch({ 0.01, 0.04, 0.2 , pv_mass });
std::vector<bool> dirichlet ({ true, true, true });
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplR::Field>(Block4));
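To make the block-size arithmetic above concrete, a worked example with a hypothetical decomposition (not the one used in this test):
#include <cstdio>
int main() {
  int latt[4] = {32, 32, 32, 64};   // global lattice
  int mpi [4] = { 2,  2,  2,  4};   // MPI decomposition
  int shm [4] = { 2,  2,  1,  1};   // ranks per node in each direction
  int Dirichlet[5] = {0, 0, 0, 0, 0};
  for (int d = 0; d < 4; d++) {
    int CommDim = (mpi[d] / shm[d]) > 1 ? 1 : 0;          // any inter-node traffic?
    Dirichlet[d + 1] = CommDim * latt[d] / mpi[d] * shm[d];
  }
  // Prints 0 0 16 16 : 16-site blocks along z and t, where comms leave the node,
  // and no blocking along x and y, where traffic stays inside the node.
  std::printf("%d %d %d %d\n", Dirichlet[1], Dirichlet[2], Dirichlet[3], Dirichlet[4]);
  return 0;
}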
//////////////////////////
// Fermion Grid
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
// These lines are unnecessary if BCs are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-10;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
ActionLevel<HMCWrapper::Field> Level3(8);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
// Level1.push_back(&StrangePseudoFermion);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
light_num.push_back(hasenbusch[h]);
dirichlet_num.push_back(1);
dirichlet_den.push_back(1);
}
light_num.push_back(pv_mass);
dirichlet_num.push_back(0);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h]<< " (" << dirichlet_num[h]
<<") / " << light_den[h]<< " (" << dirichlet_den[h]<<")"<< std::endl;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
if ( dirichlet_den[h]==1) Denominators[h]->DirichletBlock(Dirichlet);
if ( dirichlet_num[h]==1) Numerators[h]->DirichletBlock(Dirichlet);
}
int nquo=Quotients.size();
Level1.push_back(Quotients[0]);
Level1.push_back(Quotients[nquo-1]);
for(int h=1;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.Run(); // no smearing
Grid_finalize();
} // main
View File
@ -217,9 +217,9 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,1,
recv_from_rank,
bytes,mu);
comm_proc = mpi_layout[mu]-1;
@ -228,9 +228,9 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
recv_from_rank,
bytes,mu+4);
}
@ -309,9 +309,9 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,1,
recv_from_rank,
bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
@ -322,9 +322,9 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
recv_from_rank,
bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
@ -411,8 +411,8 @@ int main (int argc, char ** argv)
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
thread_critical { dbytes+=tbytes; }
}
View File
@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
template<class d>
struct scal {
d internal;
};
void Benchmark(int Ls, Coordinate Dirichlet);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
@ -52,82 +52,24 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++) {
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
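As a quick check on the constant just above: for Nc = 3 it evaluates to 8*3*(7 + 16*3) = 24*55 = 1320 floating-point operations per site, the usual Wilson-dslash count.
// Compile-time sanity check of the per-site flop count (Nc = 3):
static_assert(8 * 3 * (7 + 16 * 3) == 1320, "Wilson dslash: 1320 flops per site");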
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
@ -138,9 +80,9 @@ void Benchmark(int Ls, Coordinate Dirichlet)
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
#if 0
@ -158,6 +100,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
@ -165,31 +108,29 @@ void Benchmark(int Ls, Coordinate Dirichlet)
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
LatticeGaugeFieldF UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<LatticeGaugeFieldF> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@ -249,15 +190,12 @@ void Benchmark(int Ls, Coordinate Dirichlet)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF::ImplParams p;
p.dirichlet=Dirichlet;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.ImportGauge(Umu);
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =300;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
@ -282,20 +220,29 @@ void Benchmark(int Ls, Coordinate Dirichlet)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
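// Reading of the model above: (2*Nd+1) spinor-like objects of Nd*Nc complex words per 5d site,
// plus 2*Nd gauge links of Nc*Nc per 4d site; simdwidth/nsimd is taken as the bytes per complex word.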
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
// std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
// std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
if (1)
@ -339,20 +286,21 @@ void Benchmark(int Ls, Coordinate Dirichlet)
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
assert((norm2(err)<1.0e-4));
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
@ -382,6 +330,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
@ -403,6 +352,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
@ -417,7 +367,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-4);
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@ -426,4 +382,6 @@ void Benchmark(int Ls, Coordinate Dirichlet)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File

@ -81,8 +81,8 @@ int main (int argc, char ** argv)
Vector<Coeff_t> diag = Dw.bs;
Vector<Coeff_t> upper= Dw.cs;
Vector<Coeff_t> lower= Dw.cs;
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
lower[0] =-Dw.mass_plus*lower[0];
upper[Ls-1]=-Dw.mass*upper[Ls-1];
lower[0] =-Dw.mass*lower[0];
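// In the 5d hopping term the physical quark mass enters only through the s=0 and s=Ls-1
// wrap-around couplings, hence the boundary entries of upper/lower are scaled by -mass.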
LatticeFermion r_eo(FGrid);
LatticeFermion src_e (FrbGrid);

View File

@ -44,13 +44,6 @@ void bench_wilson (
double const volume,
int const dag );
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@ -117,8 +110,8 @@ int main (int argc, char ** argv)
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << "\t";
// EO
bench_wilson_eo(src_o,result_e,Dw,volume,DaggerNo);
bench_wilson_eo(src_o,result_e,Dw,volume,DaggerYes);
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << std::endl;
}
}

View File

@ -159,7 +159,7 @@ case ${ac_ZMOBIUS} in
esac
############### Nc
AC_ARG_ENABLE([Nc],
[AC_HELP_STRING([--enable-Nc=2|3|4|5], [enable number of colours])],
[AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in

View File

@ -93,14 +93,14 @@ template<class Field> class FreeLaplacianStencil : public SparseMatrixBase<Field
{
public:
typedef typename Field::vector_object siteObject;
typedef CartesianStencil<siteObject, siteObject, SimpleStencilParams> StencilImpl;
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
GridBase *grid;
StencilImpl Stencil;
SimpleCompressor<siteObject> Compressor;
FreeLaplacianStencil(GridBase *_grid)
: Stencil (_grid,6,Even,directions,displacements,SimpleStencilParams()), grid(_grid)
: Stencil (_grid,6,Even,directions,displacements,0), grid(_grid)
{ };
virtual GridBase *Grid(void) { return grid; };
@ -168,8 +168,7 @@ public:
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef CartesianStencil<siteObject, siteObject,SimpleStencilParams> StencilImpl;
SimpleStencilParams p;
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
GridBase *grid;
StencilImpl Stencil;
@ -178,7 +177,7 @@ public:
CovariantLaplacianStencil(GaugeField &Umu)
:
grid(Umu.Grid()),
Stencil (grid,6,Even,directions,displacements,p),
Stencil (grid,6,Even,directions,displacements,0),
Uds(grid)
{
for (int mu = 0; mu < Nd; mu++) {

View File

@ -1,12 +0,0 @@
../../configure --enable-comms=mpi-auto \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-fermion-reps \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
HIPFLAGS = --amdgpu-target=gfx90a

View File

@ -1,30 +0,0 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --exclusive
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=1
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 16 --grid 32.32.32.256 --mpi 1.1.1.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS

View File

@ -1,27 +0,0 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --exclusive
DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=4
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS

View File

@ -1,48 +0,0 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 8
#SBATCH -n 64
#SBATCH --exclusive
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
DIR=.
module list
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_SMP_SINGLE_COPY_MODE=NONE
export OMP_NUM_THREADS=1
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 16 --grid 64.64.64.256 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.256.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1"
echo $PARAMS
srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node.shm0
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
echo $PARAMS
#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0

View File

@ -1,13 +0,0 @@
#!/bin/bash
lrank=$SLURM_LOCALID
lgpu=(0 1 2 3 7 6 5 4)
export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
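# Maps the SLURM local rank onto one GPU via the lgpu list above; the 0 1 2 3 7 6 5 4 ordering
# presumably follows the node's CPU-GPU affinity.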
echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
$*

View File

@ -1,5 +0,0 @@
module load PrgEnv-gnu
module load rocm/4.5.0
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a

View File

@ -1,14 +1,9 @@
DIR=`pwd`
PREFIX=$DIR/../Prequisites/install/
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-shm=nvlink \
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--enable-setdevice \
--disable-accelerator-cshift \
--with-gmp=$PREFIX \
--disable-fermion-reps \
--disable-unified \
--disable-gparity \

View File

@ -1,27 +1,24 @@
#!/bin/bash
#SBATCH -A m3886_g
#SBATCH -A mp13
#SBATCH -C gpu
#SBATCH -q debug
#SBATCH -q regular
#SBATCH -t 0:20:00
#SBATCH -c 32
#SBATCH -N 1
#SBATCH -n 4
#SBATCH -n 16
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32
#SBATCH --exclusive
#SBATCH --gpu-bind=none
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3
export SLURM_CPU_BIND="cores"
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_GPU_SUPPORT_ENABLED=1
srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node
OPT="--comms-overlap --shm-mpi 1"
VOL=64.64.32.32
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
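# 64.64.32.32 over --mpi 2.2.1.1 gives a 32.32.32.32 local volume on each of the 4 GPUs.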
#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
#srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
OPT="--comms-overlap --comms-concurrent --shm-mpi 0"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0
OPT="--comms-overlap --comms-concurrent --shm-mpi 1"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1

View File

@ -1,4 +1,4 @@
export CRAY_ACCEL_TARGET=nvidia80
module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
module load PrgEnv-gnu cpe-cuda cuda

View File

@ -6,8 +6,6 @@
--enable-simd=GPU \
--disable-fermion-reps \
--disable-gparity \
--with-gmp=$OLCF_GMP_ROOT \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
--prefix=/ccs/home/chulwoo/Grid \

View File

@ -1,7 +1,8 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 3:00:00
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
@ -13,12 +14,13 @@ DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=CMA
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 16.16.16.48 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -N2 -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./HMC/Mobius2p1f_DD_RHMC $PARAMS
PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS

View File

@ -1,9 +1,5 @@
module load emacs
module load PrgEnv-gnu
module load rocm/4.5.0
module load rocm/4.3.0
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx908
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
export LD_LIBRARY_PATH=/opt/cray/pe/gcc/mpfr/3.1.4/lib/:$LD_LIBRARY_PATH

Some files were not shown because too many files have changed in this diff